use crate::hufftree::base::Hufftree; use bimap::BiMap; use bit_vec::BitVec; #[derive(Debug)] pub struct CanonicalHufftree { characters_and_codes: BiMap, storage_char_codes: Vec<(char, u32)>, } #[derive(Debug)] struct CharTempCode { character: char, code: BitVec, code_length: usize, } impl Ord for CharTempCode { fn cmp(&self, other: &Self) -> std::cmp::Ordering { self.code_length .cmp(&other.code_length) .then(other.code.cmp(&self.code)) } } impl PartialOrd for CharTempCode { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl PartialEq for CharTempCode { fn eq(&self, other: &Self) -> bool { self.code_length == other.code_length } } impl Eq for CharTempCode {} impl CanonicalHufftree { pub fn from_tree(base_tree: Hufftree) -> Self { let characters = base_tree.get_characters(); let mut character_and_codes = Vec::new(); let mut output_characters_and_codes: BiMap = BiMap::new(); let mut storage_char_codes = Vec::new(); for character in characters { let code = base_tree.get_character_code(*character).unwrap(); let length = code.len(); character_and_codes.push(CharTempCode { character: *character, code, code_length: length, }); } character_and_codes.sort(); let mut character_and_codes: Vec = character_and_codes.into_iter().rev().collect(); // println!("Ordered characters: {:?}", character_and_codes); let mut first = true; let mut prev_length = 0; let mut working_code: u32 = 0b0; while let Some(temp_char) = character_and_codes.pop() { // This will results in the vector being ordered with the most frequent // character first, ordered by code length. storage_char_codes.push((temp_char.character, temp_char.code_length as u32)); let mut code = BitVec::new(); code.grow(temp_char.code_length, false); if first { prev_length = temp_char.code_length; output_characters_and_codes.insert(temp_char.character, code); first = false; continue; } if temp_char.code_length > prev_length { working_code += 1; working_code <<= temp_char.code_length - prev_length; let code = convert_no_to_bit_vec_known_length(working_code, &mut code); output_characters_and_codes .insert_no_overwrite(temp_char.character, code) .expect("There was already a character with that code."); assert!(output_characters_and_codes.contains_left(&temp_char.character)); } else { assert_eq!( temp_char.code_length, prev_length, "Something went really wrong if we got here." ); working_code += 1; let code = convert_no_to_bit_vec_known_length(working_code, &mut code); output_characters_and_codes .insert_no_overwrite(temp_char.character, code) .expect("There was already a character with that code."); } prev_length = temp_char.code_length; } CanonicalHufftree { characters_and_codes: output_characters_and_codes, storage_char_codes, } } pub fn from_vec(storage: Vec<(char, u32)>) -> Self { // Assume that vec is ordered, first by code length, // then by character frequency, as this is the way // the file should have been encoded. // This means that it must first be reversed. let mut temp_storage: Vec<(char, u32)> = storage.into_iter().rev().collect(); let mut bi = BiMap::new(); let mut first = true; let mut prev_length = 0; let mut working_code: u32 = 0b0; while let Some((temp_char, code_length)) = temp_storage.pop() { // This will result in the vector being ordered with the most frequent // character first, ordered by code length. // let mut code = BitVec::new(); code.grow(code_length as usize, false); if first { prev_length = code_length; bi.insert(temp_char, code); first = false; continue; } if code_length > prev_length { working_code += 1; working_code <<= code_length - prev_length; let code = convert_no_to_bit_vec_known_length(working_code, &mut code); bi.insert_no_overwrite(temp_char, code) .expect("There was already a character with that code."); } else { assert_eq!( code_length, prev_length, "Something went really wrong if we got here." ); working_code += 1; let code = convert_no_to_bit_vec_known_length(working_code, &mut code); bi.insert_no_overwrite(temp_char, code) .expect("There was already a character with that code."); } prev_length = code_length; } CanonicalHufftree { characters_and_codes: bi, // Will be empty. storage_char_codes: temp_storage, } } pub fn encode_text(&self, text: &str) -> Result { let total_bits = text .chars() .map(|c| self.characters_and_codes.get_by_left(&c).map_or(0, |code| code.len())) .sum(); let mut converted_text = BitVec::with_capacity(total_bits); for character in text.chars() { let code = self.characters_and_codes .get_by_left(&character) .ok_or("Character not found in encoding table.")?; for bit in code.iter() { converted_text.push(bit); } } Ok(converted_text) } pub fn decode_text(&self, text: BitVec) -> Result { let mut decoded_text = String::new(); // So that popping bits removes them from the "start" of the text. let mut text: BitVec = text.iter().rev().collect(); let mut buffer = BitVec::new(); while !text.is_empty() { buffer.push(text.pop().unwrap()); match self.characters_and_codes.get_by_right(&buffer) { Some(character) => { decoded_text.push(*character); buffer.truncate(0); } None => continue, } } if !buffer.is_empty() { Err("Text was not decoded properly (trailing bits).") } else { Ok(decoded_text) } } pub fn get_character_codes_for_storage(&self) -> &[(char, u32)] { &self.storage_char_codes } } pub fn convert_no_to_bit_vec(mut numb: u32) -> BitVec { let mut output_vec = BitVec::new(); while numb > 0 { if numb % 2 == 0 { output_vec.push(false); } else { output_vec.push(true); } numb /= 2; } let output_vec = output_vec.iter().rev().collect(); output_vec } pub fn convert_no_to_bit_vec_known_length(mut numb: u32, bits: &mut BitVec) -> BitVec { let mut counter = 0; while numb > 0 { if numb % 2 == 0 { bits.set(counter, false); } else { bits.set(counter, true); } numb /= 2; counter += 1; } let bits = bits.iter().rev().collect(); bits } #[cfg(test)] mod canonical_tests { use std::collections::HashMap; use super::*; #[test] fn correct_conversion_of_number() { let numb = 0b111001; let result = convert_no_to_bit_vec(numb); assert!(result.eq_vec(&[true, true, true, false, false, true])); } #[test] fn create_correct_canonical_codes() { let mut chars_and_freq: HashMap = HashMap::new(); chars_and_freq.insert('a', 25); chars_and_freq.insert('b', 14); chars_and_freq.insert('c', 5); let huff = Hufftree::new(chars_and_freq); let a_code = huff.get_character_code('a').unwrap(); let b_code = huff.get_character_code('b').unwrap(); let c_code = huff.get_character_code('c').unwrap(); assert_eq!(a_code.to_string(), "1"); assert_eq!(b_code.to_string(), "01"); assert_eq!(c_code.to_string(), "00"); let canonical = CanonicalHufftree::from_tree(huff); assert_eq!( canonical .characters_and_codes .get_by_left(&'a') .unwrap() .to_string(), "0" ); assert_eq!( canonical .characters_and_codes .get_by_left(&'b') .unwrap() .to_string(), "10" ); assert_eq!( canonical .characters_and_codes .get_by_left(&'c') .unwrap() .to_string(), "11" ); } #[test] fn encode_and_decode_text() { let mut chars_and_freq: HashMap = HashMap::new(); chars_and_freq.insert('a', 25); chars_and_freq.insert('b', 14); chars_and_freq.insert('c', 5); let huff = Hufftree::new(chars_and_freq); let canonical = CanonicalHufftree::from_tree(huff); let input_text = String::from("aaabacacaaaabbbbbbbccccccccccccaacc"); let encoded_text = canonical.encode_text(&input_text).unwrap(); let decoded_text = canonical.decode_text(encoded_text).unwrap(); assert_eq!(input_text, decoded_text); } }