| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324 |
- use crate::hufftree::base::Hufftree;
- use bimap::BiMap;
- use bit_vec::BitVec;
- #[derive(Debug)]
- pub struct CanonicalHufftree {
- characters_and_codes: BiMap<char, BitVec>,
- storage_char_codes: Vec<(char, u32)>,
- }
- #[derive(Debug)]
- struct CharTempCode {
- character: char,
- code: BitVec,
- code_length: usize,
- }
- impl Ord for CharTempCode {
- fn cmp(&self, other: &Self) -> std::cmp::Ordering {
- self.code_length
- .cmp(&other.code_length)
- .then(other.code.cmp(&self.code))
- }
- }
- impl PartialOrd for CharTempCode {
- fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
- Some(self.cmp(other))
- }
- }
- impl PartialEq for CharTempCode {
- fn eq(&self, other: &Self) -> bool {
- self.code_length == other.code_length
- }
- }
- impl Eq for CharTempCode {}
- impl CanonicalHufftree {
- pub fn from_tree(base_tree: Hufftree) -> Self {
- let characters = base_tree.get_characters();
- let mut character_and_codes = Vec::new();
- let mut output_characters_and_codes: BiMap<char, BitVec> = BiMap::new();
- let mut storage_char_codes = Vec::new();
- for character in characters {
- let code = base_tree.get_character_code(*character).unwrap();
- let length = code.len();
- character_and_codes.push(CharTempCode {
- character: *character,
- code,
- code_length: length,
- });
- }
- character_and_codes.sort();
- let mut character_and_codes: Vec<CharTempCode> =
- character_and_codes.into_iter().rev().collect();
- // println!("Ordered characters: {:?}", character_and_codes);
- let mut first = true;
- let mut prev_length = 0;
- let mut working_code: u32 = 0b0;
- while let Some(temp_char) = character_and_codes.pop() {
- // This will results in the vector being ordered with the most frequent
- // character first, ordered by code length.
- storage_char_codes.push((temp_char.character, temp_char.code_length as u32));
- let mut code = BitVec::new();
- code.grow(temp_char.code_length, false);
- if first {
- prev_length = temp_char.code_length;
- output_characters_and_codes.insert(temp_char.character, code);
- first = false;
- continue;
- }
- if temp_char.code_length > prev_length {
- working_code += 1;
- working_code <<= temp_char.code_length - prev_length;
- let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
- output_characters_and_codes
- .insert_no_overwrite(temp_char.character, code)
- .expect("There was already a character with that code.");
- assert!(output_characters_and_codes.contains_left(&temp_char.character));
- } else {
- assert_eq!(
- temp_char.code_length, prev_length,
- "Something went really wrong if we got here."
- );
- working_code += 1;
- let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
- output_characters_and_codes
- .insert_no_overwrite(temp_char.character, code)
- .expect("There was already a character with that code.");
- }
- prev_length = temp_char.code_length;
- }
- CanonicalHufftree {
- characters_and_codes: output_characters_and_codes,
- storage_char_codes,
- }
- }
- pub fn from_vec(storage: Vec<(char, u32)>) -> Self {
- // Assume that vec is ordered, first by code length,
- // then by character frequency, as this is the way
- // the file should have been encoded.
- // This means that it must first be reversed.
- let mut temp_storage: Vec<(char, u32)> = storage.into_iter().rev().collect();
- let mut bi = BiMap::new();
- let mut first = true;
- let mut prev_length = 0;
- let mut working_code: u32 = 0b0;
- while let Some((temp_char, code_length)) = temp_storage.pop() {
- // This will result in the vector being ordered with the most frequent
- // character first, ordered by code length.
- //
- let mut code = BitVec::new();
- code.grow(code_length as usize, false);
- if first {
- prev_length = code_length;
- bi.insert(temp_char, code);
- first = false;
- continue;
- }
- if code_length > prev_length {
- working_code += 1;
- working_code <<= code_length - prev_length;
- let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
- bi.insert_no_overwrite(temp_char, code)
- .expect("There was already a character with that code.");
- } else {
- assert_eq!(
- code_length, prev_length,
- "Something went really wrong if we got here."
- );
- working_code += 1;
- let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
- bi.insert_no_overwrite(temp_char, code)
- .expect("There was already a character with that code.");
- }
- prev_length = code_length;
- }
- CanonicalHufftree {
- characters_and_codes: bi,
- // Will be empty.
- storage_char_codes: temp_storage,
- }
- }
- pub fn encode_text(&self, text: &str) -> Result<BitVec, &'static str> {
- let total_bits = text
- .chars()
- .map(|c| self.characters_and_codes.get_by_left(&c).map_or(0, |code| code.len()))
- .sum();
- let mut converted_text = BitVec::with_capacity(total_bits);
- for character in text.chars() {
- let code = self.characters_and_codes
- .get_by_left(&character)
- .ok_or("Character not found in encoding table.")?;
- for bit in code.iter() {
- converted_text.push(bit);
- }
- }
- Ok(converted_text)
- }
- pub fn decode_text(&self, text: BitVec) -> Result<String, &'static str> {
- let mut decoded_text = String::new();
- // So that popping bits removes them from the "start" of the text.
- let mut text: BitVec = text.iter().rev().collect();
- let mut buffer = BitVec::new();
- while !text.is_empty() {
- buffer.push(text.pop().unwrap());
- match self.characters_and_codes.get_by_right(&buffer) {
- Some(character) => {
- decoded_text.push(*character);
- buffer.truncate(0);
- }
- None => continue,
- }
- }
- if !buffer.is_empty() {
- Err("Text was not decoded properly (trailing bits).")
- } else {
- Ok(decoded_text)
- }
- }
- pub fn get_character_codes_for_storage(&self) -> &[(char, u32)] {
- &self.storage_char_codes
- }
- }
- pub fn convert_no_to_bit_vec(mut numb: u32) -> BitVec {
- let mut output_vec = BitVec::new();
- while numb > 0 {
- if numb % 2 == 0 {
- output_vec.push(false);
- } else {
- output_vec.push(true);
- }
- numb /= 2;
- }
- let output_vec = output_vec.iter().rev().collect();
- output_vec
- }
- pub fn convert_no_to_bit_vec_known_length(mut numb: u32, bits: &mut BitVec) -> BitVec {
- let mut counter = 0;
- while numb > 0 {
- if numb % 2 == 0 {
- bits.set(counter, false);
- } else {
- bits.set(counter, true);
- }
- numb /= 2;
- counter += 1;
- }
- let bits = bits.iter().rev().collect();
- bits
- }
- #[cfg(test)]
- mod canonical_tests {
- use std::collections::HashMap;
- use super::*;
- #[test]
- fn correct_conversion_of_number() {
- let numb = 0b111001;
- let result = convert_no_to_bit_vec(numb);
- assert!(result.eq_vec(&[true, true, true, false, false, true]));
- }
- #[test]
- fn create_correct_canonical_codes() {
- let mut chars_and_freq: HashMap<char, i32> = HashMap::new();
- chars_and_freq.insert('a', 25);
- chars_and_freq.insert('b', 14);
- chars_and_freq.insert('c', 5);
- let huff = Hufftree::new(chars_and_freq);
- let a_code = huff.get_character_code('a').unwrap();
- let b_code = huff.get_character_code('b').unwrap();
- let c_code = huff.get_character_code('c').unwrap();
- assert_eq!(a_code.to_string(), "1");
- assert_eq!(b_code.to_string(), "01");
- assert_eq!(c_code.to_string(), "00");
- let canonical = CanonicalHufftree::from_tree(huff);
- assert_eq!(
- canonical
- .characters_and_codes
- .get_by_left(&'a')
- .unwrap()
- .to_string(),
- "0"
- );
- assert_eq!(
- canonical
- .characters_and_codes
- .get_by_left(&'b')
- .unwrap()
- .to_string(),
- "10"
- );
- assert_eq!(
- canonical
- .characters_and_codes
- .get_by_left(&'c')
- .unwrap()
- .to_string(),
- "11"
- );
- }
- #[test]
- fn encode_and_decode_text() {
- let mut chars_and_freq: HashMap<char, i32> = HashMap::new();
- chars_and_freq.insert('a', 25);
- chars_and_freq.insert('b', 14);
- chars_and_freq.insert('c', 5);
- let huff = Hufftree::new(chars_and_freq);
- let canonical = CanonicalHufftree::from_tree(huff);
- let input_text = String::from("aaabacacaaaabbbbbbbccccccccccccaacc");
- let encoded_text = canonical.encode_text(&input_text).unwrap();
- let decoded_text = canonical.decode_text(encoded_text).unwrap();
- assert_eq!(input_text, decoded_text);
- }
- }
|