Просмотр исходного кода

Added canonical encoding and decoding tree

AvariceLHubris 1 год назад
Родитель
Коммит
585d4a40d6
5 измененных файлов с 238 добавлено и 3 удалено
  1. 7 0
      Cargo.lock
  2. 1 0
      Cargo.toml
  3. 14 3
      src/hufftree/base.rs
  4. 214 0
      src/hufftree/canonical.rs
  5. 2 0
      src/hufftree/mod.rs

+ 7 - 0
Cargo.lock

@@ -2,6 +2,12 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "bimap"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "230c5f1ca6a325a32553f8640d31ac9b49f2411e901e427570154868b46da4f7"
+
 [[package]]
 name = "bit-vec"
 version = "0.8.0"
@@ -12,5 +18,6 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
 name = "huffman"
 version = "0.1.0"
 dependencies = [
+ "bimap",
  "bit-vec",
 ]

+ 1 - 0
Cargo.toml

@@ -5,3 +5,4 @@ edition = "2021"
 
 [dependencies]
 bit-vec = "0.8"
+bimap = "0.6.3"

+ 14 - 3
src/hufftree.rs → src/hufftree/base.rs

@@ -1,19 +1,22 @@
-use std::collections::HashMap;
-
 use bit_vec::BitVec;
+use std::collections::HashMap;
 
 use crate::node::Node;
 
+/// Huffman tree together with the characters it was built from.
+#[derive(Debug)]
 pub struct Hufftree {
+    // Root of the tree: the node popped last from the node list in `new`.
     root: Node,
+    // Every character passed to `new`, in the iteration order of the input map.
+    characters: Vec<char>,
 }
 
 impl Hufftree {
     pub fn new(chars_and_freq: HashMap<char, i32>) -> Hufftree {
         let mut nodes: Vec<Node> = Vec::new();
+        let mut characters: Vec<char> = Vec::new();
 
         for (c, f) in chars_and_freq {
             nodes.push(Node::new_with_character(c, f));
+            characters.push(c);
         }
 
         nodes.sort();
@@ -37,6 +40,7 @@ impl Hufftree {
 
         Hufftree {
             root: nodes.pop().expect("There was no node in the vector."),
+            characters,
         }
     }
 
@@ -59,6 +63,10 @@ impl Hufftree {
 
         Ok(output)
     }
+
+    /// Returns the characters that were supplied when the tree was built.
+    pub fn get_characters(&self) -> &Vec<char> {
+        &self.characters
+    }
 }
 
 #[cfg(test)]
@@ -73,7 +81,10 @@ mod test {
         chars_and_freq.insert('c', 5);
 
         let huff = Hufftree::new(chars_and_freq);
-        assert_eq!(huff.root.get_frequency(), 44)
+        assert_eq!(huff.root.get_frequency(), 44);
+        assert!(huff.characters.contains(&'a'));
+        assert!(huff.characters.contains(&'b'));
+        assert!(huff.characters.contains(&'c'));
     }
 
     #[test]

+ 214 - 0
src/hufftree/canonical.rs

@@ -0,0 +1,214 @@
+use crate::hufftree::base::Hufftree;
+use bimap::BiMap;
+use bit_vec::BitVec;
+
+/// Canonical Huffman code table derived from a [`Hufftree`].
+///
+/// Keeps the source tree together with a two-way map between each character
+/// and its canonical bit code.
+#[derive(Debug)]
+pub struct CanonicalHufftree {
+    // The tree the canonical codes were derived from.
+    base_tree: Hufftree,
+    // Two-way lookup: character -> code when encoding, code -> character when decoding.
+    characters_and_codes: BiMap<char, BitVec>,
+}
+
+/// Scratch record pairing a character with the code read from the base tree.
+///
+/// Ordered by code length first and by character second, so sorting a list of
+/// records gives one reproducible order no matter how the list was collected.
+#[derive(PartialEq, Eq, Debug)]
+struct CharTempCode {
+    character: char,
+    code: BitVec,
+    code_length: usize,
+}
+
+impl Ord for CharTempCode {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        // Comparing the length alone leaves equal-length entries in input order,
+        // which makes the canonical codes depend on that order; break ties on
+        // the character. The code is compared last so `cmp` agrees with `Eq`.
+        self.code_length
+            .cmp(&other.code_length)
+            .then_with(|| self.character.cmp(&other.character))
+            .then_with(|| self.code.cmp(&other.code))
+    }
+}
+
+impl PartialOrd for CharTempCode {
+    // Delegates to `Ord`: a derived field-by-field comparison would disagree
+    // with `cmp` and break the contract between the two traits.
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl CanonicalHufftree {
+    /// Builds the canonical code table for every character of `base_tree`.
+    ///
+    /// Characters are ordered by code length and then by character value. The
+    /// first one gets an all-zero code; each following one gets the previous
+    /// code plus one, shifted left by however much the length grew. Every code
+    /// is emitted with exactly the length the base tree gave that character.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `base_tree` yields no code for one of its own characters, or
+    /// if a code is longer than 32 bits (the working counter is a `u32`).
+    pub fn from_tree(base_tree: Hufftree) -> Self {
+        let mut entries: Vec<CharTempCode> = base_tree
+            .get_characters()
+            .iter()
+            .map(|&character| {
+                let code = base_tree
+                    .get_character_code(character)
+                    .expect("every character of the base tree must have a code");
+                let code_length = code.len();
+                CharTempCode {
+                    character,
+                    code,
+                    code_length,
+                }
+            })
+            .collect();
+
+        // Sort on an explicit key: ties between equal-length codes must not
+        // depend on the order in which the characters were collected.
+        entries.sort_unstable_by_key(|entry| (entry.code_length, entry.character));
+
+        let mut output_characters_and_codes: BiMap<char, BitVec> = BiMap::new();
+        let mut prev_length = 0;
+        let mut working_code: u32 = 0;
+
+        for (index, entry) in entries.iter().enumerate() {
+            assert!(
+                entry.code_length <= u32::BITS as usize,
+                "Code lengths above 32 bits are not supported."
+            );
+
+            // The first entry keeps the all-zero code; afterwards count up and
+            // shift left by the growth in length (a shift of zero when equal).
+            if index > 0 {
+                working_code = (working_code + 1) << (entry.code_length - prev_length);
+            }
+
+            // `convert_no_to_bit_vec` drops leading zeros, so pad on the left to
+            // the required length; otherwise a code such as `01` becomes `1`.
+            let mut significant_bits = convert_no_to_bit_vec(working_code);
+            let mut code = BitVec::new();
+            code.grow(
+                entry.code_length.saturating_sub(significant_bits.len()),
+                false,
+            );
+            code.append(&mut significant_bits);
+
+            output_characters_and_codes.insert(entry.character, code);
+            prev_length = entry.code_length;
+        }
+
+        CanonicalHufftree {
+            base_tree,
+            characters_and_codes: output_characters_and_codes,
+        }
+    }
+
+    /// Encodes `text` by concatenating the canonical code of each character.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `text` contains a character that has no code in this table.
+    pub fn encode_text(&self, text: &str) -> BitVec {
+        let mut converted_text = BitVec::new();
+
+        for character in text.chars() {
+            let code = self
+                .characters_and_codes
+                .get_by_left(&character)
+                .expect("Text contains a character that has no code.");
+            // Extending from the stored code avoids cloning it per character.
+            converted_text.extend(code.iter());
+        }
+
+        converted_text
+    }
+
+    /// Decodes a bit sequence produced by [`CanonicalHufftree::encode_text`].
+    ///
+    /// Bits are consumed front to back and collected in a buffer; whenever the
+    /// buffer equals a known code, its character is emitted and the buffer is
+    /// emptied.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when bits are left over that match no code.
+    pub fn decode_text(&self, text: BitVec) -> Result<String, &str> {
+        let mut decoded_text = String::new();
+        let mut buffer = BitVec::new();
+
+        for bit in text.iter() {
+            buffer.push(bit);
+            if let Some(character) = self.characters_and_codes.get_by_right(&buffer) {
+                decoded_text.push(*character);
+                buffer.truncate(0);
+            }
+        }
+
+        if buffer.is_empty() {
+            Ok(decoded_text)
+        } else {
+            Err("Text was not decoded properly (trailing bits).")
+        }
+    }
+}
+
+/// Converts `numb` to its binary digits, most significant bit first.
+///
+/// No leading zeros are produced, so `0` yields an empty vector.
+fn convert_no_to_bit_vec(numb: u32) -> BitVec {
+    // Number of digits up to and including the highest set bit.
+    let significant_bits = u32::BITS - numb.leading_zeros();
+
+    (0..significant_bits)
+        .rev()
+        .map(|position| (numb >> position) & 1 == 1)
+        .collect()
+}
+
+#[cfg(test)]
+mod canonical_tests {
+    use std::collections::HashMap;
+
+    use super::*;
+
+    /// Builds the three-symbol base tree shared by the tests below.
+    fn sample_tree() -> Hufftree {
+        let chars_and_freq: HashMap<char, i32> = HashMap::from([('a', 25), ('b', 14), ('c', 5)]);
+        Hufftree::new(chars_and_freq)
+    }
+
+    #[test]
+    fn correct_conversion_of_number() {
+        let bits = convert_no_to_bit_vec(0b111001);
+
+        assert!(bits.eq_vec(&[true, true, true, false, false, true]));
+    }
+
+    #[test]
+    fn create_correct_canonical_codes() {
+        let huff = sample_tree();
+
+        // Codes as assigned by the plain tree.
+        for (character, expected) in [('a', "1"), ('b', "01"), ('c', "00")] {
+            let code = huff.get_character_code(character).unwrap();
+            assert_eq!(code.to_string(), expected);
+        }
+
+        // Codes after canonical reassignment.
+        let canonical = CanonicalHufftree::from_tree(huff);
+        for (character, expected) in [('a', "0"), ('b', "10"), ('c', "11")] {
+            let code = canonical
+                .characters_and_codes
+                .get_by_left(&character)
+                .unwrap();
+            assert_eq!(code.to_string(), expected);
+        }
+    }
+
+    #[test]
+    fn encode_and_decode_text() {
+        let canonical = CanonicalHufftree::from_tree(sample_tree());
+
+        let input_text = String::from("aaabacacaaaabbbbbbbccccccccccccaacc");
+        let encoded_text = canonical.encode_text(&input_text);
+        println!("Encoded text:{:?}", encoded_text);
+
+        let decoded_text = canonical.decode_text(encoded_text).unwrap();
+
+        assert_eq!(input_text, decoded_text);
+    }
+}

+ 2 - 0
src/hufftree/mod.rs

@@ -0,0 +1,2 @@
+pub mod canonical;
+mod base;