Commit 48701b7b authored by Matija Obreza
Browse files

Improved internal structures for Most Frequent K Chars

parent d51cf97e
......@@ -37,9 +37,23 @@ public class MostFrequentKChars {
return outputString
* </pre>
*/
public static String mostFrequentKHashing(String string, int k) {
public static String getMostFrequentKHash(String string, int k) {
    // Build the int[] form of the hash first, then render it as text (e.g. "i3n2").
    final int[] hash = calculateHash(string, k);
    return toHashString(hash);
}
/**
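* Generate the hash as int[]. The array holds (character, frequency) pairs: each even index
* holds a character (cast to int) and the odd index that follows holds that character's frequency.
*
* The array is allocated with 2 * k elements.
*
* @param string input string
* @param k limit result to k most frequent characters
* @return the hash array of (character, frequency) pairs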
*/
static int[] calculateHash(String string, int k) {
char[] input = string.toCharArray();
int[] occurrences = new int[string.length()];
int[] hash = new int[2 * k];
// System.err.println("occurences=" + Arrays.toString(occurrences));
// track maximum occurrence value
......@@ -57,20 +71,19 @@ public class MostFrequentKChars {
// System.err.println("occurences=" + Arrays.toString(occurrences));
// System.err.println("maxocc=" + maxOcc);
StringBuilder sb = new StringBuilder();
// find the top K occurrences
int limit = 0;
int limit = 0, pos = 0;
while (maxOcc > 0 && limit < k) {
int nextOcc = 0;
for (int i = 0; i < occurrences.length; i++) {
if (occurrences[i] == maxOcc) {
sb.append(input[i]).append(occurrences[i]);
// System.err.println("res=" + sb.toString());
hash[pos++] = input[i];
hash[pos++] = occurrences[i];
// System.err.println("pos = " + pos);
if (++limit >= k) {
return sb.toString();
return hash;
}
} else if (occurrences[i] > nextOcc) {
} else if (occurrences[i] < maxOcc && occurrences[i] > nextOcc) {
nextOcc = occurrences[i];
// System.err.println("nextocc=" + nextOcc);
}
......@@ -78,10 +91,19 @@ public class MostFrequentKChars {
maxOcc = nextOcc;
}
return sb.toString();
return hash;
}
/**
 * Compute the similarity of two hashes given in their String form (e.g. "i3n2").
 *
 * Both arguments are decoded with {@link #decodeHash(String)} and the comparison itself is
 * delegated to {@link #getMostFreqKSimilarity(int[], int[])}.
 *
 * @param hash1 first hash as String
 * @param hash2 second hash as String
 * @return the similarity computed by the int[] overload
 */
public static int getMostFreqKSimilarity(String hash1, String hash2) {
    final int[] first = decodeHash(hash1);
    final int[] second = decodeHash(hash2);
    return getMostFreqKSimilarity(first, second);
}
/**
* Calculate the similarity of the two hashes
*
* <pre>
int function MostFreqKSimilarity (String inputStr1, String inputStr2, int limit)
def int similarity
......@@ -94,21 +116,23 @@ public class MostFrequentKChars {
// return limit - similarity
return similarity
* </pre>
*
* @param hash1
* @param hash2
* @return
*/
public static int mostFreqKSimilarity(String hash1, String hash2) {
public static int getMostFreqKSimilarity(int[] hash1, int[] hash2) {
int similarity = 0;
int[] h1 = decode(hash1);
int[] h2 = decode(hash2);
for (int i = 0; i < h1.length; i++) {
char c = (char) h1[i++];
int freq1 = h1[i];
for (int i = 0; i < hash1.length; i++) {
char c = (char) hash1[i++];
int freq1 = hash1[i];
// System.err.println("c=" + c + " f=" + freq1);
int freq2 = findFrequency(h2, c);
int freq2 = findFrequency(hash2, c);
if (freq2 >= 0) {
// System.err.println("found c=" + c + " f2=" + freq2);
similarity += freq1 + freq2;
similarity += Math.min(freq1, freq2);
}
}
......@@ -122,7 +146,29 @@ public class MostFrequentKChars {
* </pre>
*/
public static int mostFreqKSDF(String inputStr1, String inputStr2, int K, int maxDistance) {
return maxDistance - mostFreqKSimilarity(mostFrequentKHashing(inputStr1, K), mostFrequentKHashing(inputStr2, K));
return maxDistance - getMostFreqKSimilarity(calculateHash(inputStr1, K), calculateHash(inputStr2, K));
}
/**
 * Compute a normalized score for two strings from their K most frequent characters.
 *
 * The similarity of the two hashes is divided by the larger of the two hashes' frequency sums.
 * When that larger sum is zero the floating-point division does not yield a finite number.
 *
 * @param inputStr1 first string
 * @param inputStr2 second string
 * @param K number of most frequent characters used for each hash
 * @return similarity of the two hashes divided by the larger frequency sum
 */
public static double mostFreqKSDF(String inputStr1, String inputStr2, int K) {
    final int[] first = calculateHash(inputStr1, K);
    final int[] second = calculateHash(inputStr2, K);
    final int similarity = getMostFreqKSimilarity(first, second);
    final double largerSum = Math.max(getFrequencySum(first), getFrequencySum(second));
    return 1.0 * similarity / largerSum;
}
/**
 * Add up the frequencies stored in a hash array.
 *
 * Frequencies sit at the odd indices (1, 3, 5, ...); the even indices hold the characters and
 * are skipped.
 *
 * @param hash the hash array of (character, frequency) pairs
 * @return sum of all frequency entries, 0 for an empty array
 */
static double getFrequencySum(int[] hash) {
    double total = 0;
    int idx = 1;
    while (idx < hash.length) {
        total += hash[idx];
        idx += 2;
    }
    return total;
}
/**
......@@ -147,7 +193,7 @@ public class MostFrequentKChars {
* @param hash1
* @return
*/
static int[] decode(String hash1) {
static int[] decodeHash(String hash1) {
int[] h = new int[hash1.length()];
int pos = 0;
for (int i = 0; i < hash1.length(); i++) {
......@@ -165,4 +211,24 @@ public class MostFrequentKChars {
// System.err.println(Arrays.toString(h));
// return h;
}
/**
 * Render a hash array as its String form.
 *
 * The array is read as (character, frequency) pairs; each pair is written as the character
 * followed by the decimal frequency. Encoding stops at the first pair whose character is 0.
 *
 * @param h1 hash array of (character, frequency) pairs
 * @return String representation of the hash array (e.g. "i3b2")
 */
public static String toHashString(int[] h1) {
    final StringBuilder encoded = new StringBuilder();
    int idx = 0;
    while (idx < h1.length) {
        final char symbol = (char) h1[idx];
        if (symbol == 0) {
            // a zero character marks the end of the used part of the array
            break;
        }
        final int count = h1[idx + 1];
        encoded.append(symbol).append(count);
        idx += 2;
    }
    return encoded.toString();
}
}
......@@ -25,42 +25,79 @@ import org.junit.Test;
public class MostFrequentKCharsTest {
@Test
public void test1() {
assertThat(MostFrequentKChars.mostFrequentKHashing("night", 2), equalTo("n1i1"));
assertThat(MostFrequentKChars.mostFrequentKHashing("nacht", 2), equalTo("n1a1"));
assertThat(MostFrequentKChars.mostFrequentKHashing("research", 2), equalTo("r2e2"));
assertThat(MostFrequentKChars.mostFrequentKHashing("research", 2), equalTo("r2e2"));
assertThat(MostFrequentKChars.mostFrequentKHashing("aaaaabbbb", 2), equalTo("a5b4"));
assertThat(MostFrequentKChars.mostFrequentKHashing("ababababa", 2), equalTo("a5b4"));
assertThat(MostFrequentKChars.mostFrequentKHashing("significant", 2), equalTo("i3n2"));
assertThat(MostFrequentKChars.mostFrequentKHashing("capabilities", 2), equalTo("i3a2"));
assertThat(MostFrequentKChars.getMostFrequentKHash("night", 2), equalTo("n1i1"));
assertThat(MostFrequentKChars.getMostFrequentKHash("nacht", 2), equalTo("n1a1"));
assertThat(MostFrequentKChars.getMostFrequentKHash("research", 2), equalTo("r2e2"));
assertThat(MostFrequentKChars.getMostFrequentKHash("research", 2), equalTo("r2e2"));
assertThat(MostFrequentKChars.getMostFrequentKHash("aaaaabbbb", 2), equalTo("a5b4"));
assertThat(MostFrequentKChars.getMostFrequentKHash("ababababa", 2), equalTo("a5b4"));
assertThat(MostFrequentKChars.getMostFrequentKHash("significant", 2), equalTo("i3n2"));
assertThat(MostFrequentKChars.getMostFrequentKHash("capabilities", 2), equalTo("i3a2"));
assertThat(MostFrequentKChars.mostFrequentKHashing("LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV", 2), equalTo("L9T8"));
assertThat(MostFrequentKChars.mostFrequentKHashing("EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG", 2), equalTo("F9L8"));
assertThat(MostFrequentKChars.getMostFrequentKHash("LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV", 2), equalTo("L9T8"));
assertThat(MostFrequentKChars.getMostFrequentKHash("EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG", 2), equalTo("F9L8"));
assertThat(MostFrequentKChars.getMostFrequentKHash("tetraphyllum", 25), equalTo("t2l2e1r1a1p1h1y1u1m1"));
assertThat(MostFrequentKChars.getMostFrequentKHash("helloweeny", 25), equalTo("e3l2h1o1w1n1y1"));
}
@Test
public void testSimilarity() {
assertThat(MostFrequentKChars.mostFreqKSimilarity("n1i1", "n1a1"), equalTo(2));
assertThat(MostFrequentKChars.mostFreqKSimilarity("m1y1", "a1"), equalTo(0));
assertThat(MostFrequentKChars.mostFreqKSimilarity("r2e2", "r2e2"), equalTo(8));
assertThat(MostFrequentKChars.mostFreqKSimilarity("a5b4", "a5b4"), equalTo(18));
assertThat(MostFrequentKChars.mostFreqKSimilarity("i3n2", "i3a2"), equalTo(6));
assertThat(MostFrequentKChars.mostFreqKSimilarity("L9T8", "F9L8"), equalTo(17));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("n1i1", "n1a1"), equalTo(1));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("m1y1", "a1"), equalTo(0));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("r2e2", "r2e2"), equalTo(4));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("a5b4", "a5b4"), equalTo(9));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("i3n2", "i3a2"), equalTo(3));
assertThat(MostFrequentKChars.getMostFreqKSimilarity("L9T8", "F9L8"), equalTo(8));
}
@Test
public void testDecode() {
assertTrue(Arrays.equals(MostFrequentKChars.decode("a1"), new int[] { 'a', 1 }));
assertTrue(Arrays.equals(MostFrequentKChars.decode("a11"), new int[] { 'a', 11 }));
assertTrue(Arrays.equals(MostFrequentKChars.decode("a11b2"), new int[] { 'a', 11, 'b', 2 }));
assertTrue(Arrays.equals(MostFrequentKChars.decode("a11b22"), new int[] { 'a', 11, 'b', 22 }));
assertTrue(Arrays.equals(MostFrequentKChars.decode("a11b22e8"), new int[] { 'a', 11, 'b', 22, 'e', 8 }));
assertTrue(Arrays.equals(MostFrequentKChars.decode("i3n2"), new int[] { 'i', 3, 'n', 2 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("a1"), new int[] { 'a', 1 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("a11"), new int[] { 'a', 11 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("a11b2"), new int[] { 'a', 11, 'b', 2 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("a11b22"), new int[] { 'a', 11, 'b', 22 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("a11b22e8"), new int[] { 'a', 11, 'b', 22, 'e', 8 }));
assertTrue(Arrays.equals(MostFrequentKChars.decodeHash("i3n2"), new int[] { 'i', 3, 'n', 2 }));
}
@Test
public void testEncode() {
    // Each hash array is paired by index with the String it must encode to.
    final int[][] hashes = {
        {},
        { 'a', 1 },
        { 'a', 11 },
        { 'a', 11, 'b', 2 },
        { 'a', 11, 'b', 22 },
        { 'a', 11, 'b', 22, 'e', 8 },
        { 'i', 3, 'n', 2 }
    };
    final String[] expected = { "", "a1", "a11", "a11b2", "a11b22", "a11b22e8", "i3n2" };
    for (int n = 0; n < hashes.length; n++) {
        assertThat(MostFrequentKChars.toHashString(hashes[n]), equalTo(expected[n]));
    }
}
@Test
public void testFrequencySum() {
    // Each encoded hash is paired by index with the frequency sum expected after decoding it.
    final String[] encoded = { "a1", "a11b2", "a11b3c0", "" };
    final double[] expected = { 1d, 13d, 14d, 0d };
    for (int n = 0; n < encoded.length; n++) {
        final int[] decoded = MostFrequentKChars.decodeHash(encoded[n]);
        assertThat(MostFrequentKChars.getFrequencySum(decoded), equalTo(expected[n]));
    }
}
@Test
public void testMostFreqKMostFrequentKChars() {
assertThat(MostFrequentKChars.mostFreqKSDF("LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV", "EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG",
2, 100), equalTo(83));
2, 100), equalTo(92));
// foobar("Matija", "Mataja", 3);
// foobar("Matija", "Mataja", 2);
// foobar("Matija", "Mataja", 4);
// foobar("Matija Obreza", "Mataja Obreza", 6);
foobar("tetraphyllum", "tetraphylla", 5);
foobar("tetraphyllum", "tetraphylla", 100);
foobar("this is not a name", "the song lambada", 6);
foobar("Vicia faba", "Vicia fabiana", 6);
}
/**
 * Debugging helper for inspecting the hashes and scores of a pair of strings.
 *
 * Both print statements are commented out, so calling this method currently does nothing.
 *
 * @param string1 first string
 * @param string2 second string
 * @param K number of most frequent characters to use
 */
private void foobar(String string1, String string2, int K) {
// System.err.println("hashes " + MostFrequentKChars.getMostFrequentKHash(string1, K) + " " + MostFrequentKChars.getMostFrequentKHash(string2, K));
// System.err.println("scores " + MostFrequentKChars.mostFreqKSDF(string1, string2, K) + " " + MostFrequentKChars.mostFreqKSDF(string2, string1, K));
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment