canonical.rs 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. use crate::hufftree::base::Hufftree;
  2. use bimap::BiMap;
  3. use bit_vec::BitVec;
  4. #[derive(Debug)]
  5. pub struct CanonicalHufftree {
  6. characters_and_codes: BiMap<char, BitVec>,
  7. storage_char_codes: Vec<(char, u32)>,
  8. }
  9. #[derive(Debug)]
  10. struct CharTempCode {
  11. character: char,
  12. code: BitVec,
  13. code_length: usize,
  14. }
  15. impl Ord for CharTempCode {
  16. fn cmp(&self, other: &Self) -> std::cmp::Ordering {
  17. self.code_length
  18. .cmp(&other.code_length)
  19. .then(other.code.cmp(&self.code))
  20. }
  21. }
  22. impl PartialOrd for CharTempCode {
  23. fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
  24. Some(self.cmp(other))
  25. }
  26. }
  27. impl PartialEq for CharTempCode {
  28. fn eq(&self, other: &Self) -> bool {
  29. self.code_length == other.code_length
  30. }
  31. }
  32. impl Eq for CharTempCode {}
  33. impl CanonicalHufftree {
  34. pub fn from_tree(base_tree: Hufftree) -> Self {
  35. let characters = base_tree.get_characters();
  36. let mut character_and_codes = Vec::new();
  37. let mut output_characters_and_codes: BiMap<char, BitVec> = BiMap::new();
  38. let mut storage_char_codes = Vec::new();
  39. for character in characters {
  40. let code = base_tree.get_character_code(*character).unwrap();
  41. let length = code.len();
  42. character_and_codes.push(CharTempCode {
  43. character: *character,
  44. code,
  45. code_length: length,
  46. });
  47. }
  48. character_and_codes.sort();
  49. let mut character_and_codes: Vec<CharTempCode> =
  50. character_and_codes.into_iter().rev().collect();
  51. // println!("Ordered characters: {:?}", character_and_codes);
  52. let mut first = true;
  53. let mut prev_length = 0;
  54. let mut working_code: u32 = 0b0;
  55. while let Some(temp_char) = character_and_codes.pop() {
  56. // This will results in the vector being ordered with the most frequent
  57. // character first, ordered by code length.
  58. storage_char_codes.push((temp_char.character, temp_char.code_length as u32));
  59. let mut code = BitVec::new();
  60. code.grow(temp_char.code_length, false);
  61. if first {
  62. prev_length = temp_char.code_length;
  63. output_characters_and_codes.insert(temp_char.character, code);
  64. first = false;
  65. continue;
  66. }
  67. if temp_char.code_length > prev_length {
  68. working_code += 1;
  69. working_code <<= temp_char.code_length - prev_length;
  70. let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
  71. output_characters_and_codes
  72. .insert_no_overwrite(temp_char.character, code)
  73. .expect("There was already a character with that code.");
  74. assert!(output_characters_and_codes.contains_left(&temp_char.character));
  75. } else {
  76. assert_eq!(
  77. temp_char.code_length, prev_length,
  78. "Something went really wrong if we got here."
  79. );
  80. working_code += 1;
  81. let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
  82. output_characters_and_codes
  83. .insert_no_overwrite(temp_char.character, code)
  84. .expect("There was already a character with that code.");
  85. }
  86. prev_length = temp_char.code_length;
  87. }
  88. CanonicalHufftree {
  89. characters_and_codes: output_characters_and_codes,
  90. storage_char_codes,
  91. }
  92. }
  93. pub fn from_vec(storage: Vec<(char, u32)>) -> Self {
  94. // Assume that vec is ordered, first by code length,
  95. // then by character frequency, as this is the way
  96. // the file should have been encoded.
  97. // This means that it must first be reversed.
  98. let mut temp_storage: Vec<(char, u32)> = storage.into_iter().rev().collect();
  99. let mut bi = BiMap::new();
  100. let mut first = true;
  101. let mut prev_length = 0;
  102. let mut working_code: u32 = 0b0;
  103. while let Some((temp_char, code_length)) = temp_storage.pop() {
  104. // This will result in the vector being ordered with the most frequent
  105. // character first, ordered by code length.
  106. //
  107. let mut code = BitVec::new();
  108. code.grow(code_length as usize, false);
  109. if first {
  110. prev_length = code_length;
  111. bi.insert(temp_char, code);
  112. first = false;
  113. continue;
  114. }
  115. if code_length > prev_length {
  116. working_code += 1;
  117. working_code <<= code_length - prev_length;
  118. let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
  119. bi.insert_no_overwrite(temp_char, code)
  120. .expect("There was already a character with that code.");
  121. } else {
  122. assert_eq!(
  123. code_length, prev_length,
  124. "Something went really wrong if we got here."
  125. );
  126. working_code += 1;
  127. let code = convert_no_to_bit_vec_known_length(working_code, &mut code);
  128. bi.insert_no_overwrite(temp_char, code)
  129. .expect("There was already a character with that code.");
  130. }
  131. prev_length = code_length;
  132. }
  133. CanonicalHufftree {
  134. characters_and_codes: bi,
  135. // Will be empty.
  136. storage_char_codes: temp_storage,
  137. }
  138. }
  139. pub fn encode_text(&self, text: &str) -> Result<BitVec, &'static str> {
  140. let total_bits = text
  141. .chars()
  142. .map(|c| self.characters_and_codes.get_by_left(&c).map_or(0, |code| code.len()))
  143. .sum();
  144. let mut converted_text = BitVec::with_capacity(total_bits);
  145. for character in text.chars() {
  146. let code = self.characters_and_codes
  147. .get_by_left(&character)
  148. .ok_or("Character not found in encoding table.")?;
  149. for bit in code.iter() {
  150. converted_text.push(bit);
  151. }
  152. }
  153. Ok(converted_text)
  154. }
  155. pub fn decode_text(&self, text: BitVec) -> Result<String, &'static str> {
  156. let mut decoded_text = String::new();
  157. // So that popping bits removes them from the "start" of the text.
  158. let mut text: BitVec = text.iter().rev().collect();
  159. let mut buffer = BitVec::new();
  160. while !text.is_empty() {
  161. buffer.push(text.pop().unwrap());
  162. match self.characters_and_codes.get_by_right(&buffer) {
  163. Some(character) => {
  164. decoded_text.push(*character);
  165. buffer.truncate(0);
  166. }
  167. None => continue,
  168. }
  169. }
  170. if !buffer.is_empty() {
  171. Err("Text was not decoded properly (trailing bits).")
  172. } else {
  173. Ok(decoded_text)
  174. }
  175. }
  176. pub fn get_character_codes_for_storage(&self) -> &[(char, u32)] {
  177. &self.storage_char_codes
  178. }
  179. }
  180. pub fn convert_no_to_bit_vec(mut numb: u32) -> BitVec {
  181. let mut output_vec = BitVec::new();
  182. while numb > 0 {
  183. if numb % 2 == 0 {
  184. output_vec.push(false);
  185. } else {
  186. output_vec.push(true);
  187. }
  188. numb /= 2;
  189. }
  190. let output_vec = output_vec.iter().rev().collect();
  191. output_vec
  192. }
  193. pub fn convert_no_to_bit_vec_known_length(mut numb: u32, bits: &mut BitVec) -> BitVec {
  194. let mut counter = 0;
  195. while numb > 0 {
  196. if numb % 2 == 0 {
  197. bits.set(counter, false);
  198. } else {
  199. bits.set(counter, true);
  200. }
  201. numb /= 2;
  202. counter += 1;
  203. }
  204. let bits = bits.iter().rev().collect();
  205. bits
  206. }
  207. #[cfg(test)]
  208. mod canonical_tests {
  209. use std::collections::HashMap;
  210. use super::*;
  211. #[test]
  212. fn correct_conversion_of_number() {
  213. let numb = 0b111001;
  214. let result = convert_no_to_bit_vec(numb);
  215. assert!(result.eq_vec(&[true, true, true, false, false, true]));
  216. }
  217. #[test]
  218. fn create_correct_canonical_codes() {
  219. let mut chars_and_freq: HashMap<char, i32> = HashMap::new();
  220. chars_and_freq.insert('a', 25);
  221. chars_and_freq.insert('b', 14);
  222. chars_and_freq.insert('c', 5);
  223. let huff = Hufftree::new(chars_and_freq);
  224. let a_code = huff.get_character_code('a').unwrap();
  225. let b_code = huff.get_character_code('b').unwrap();
  226. let c_code = huff.get_character_code('c').unwrap();
  227. assert_eq!(a_code.to_string(), "1");
  228. assert_eq!(b_code.to_string(), "01");
  229. assert_eq!(c_code.to_string(), "00");
  230. let canonical = CanonicalHufftree::from_tree(huff);
  231. assert_eq!(
  232. canonical
  233. .characters_and_codes
  234. .get_by_left(&'a')
  235. .unwrap()
  236. .to_string(),
  237. "0"
  238. );
  239. assert_eq!(
  240. canonical
  241. .characters_and_codes
  242. .get_by_left(&'b')
  243. .unwrap()
  244. .to_string(),
  245. "10"
  246. );
  247. assert_eq!(
  248. canonical
  249. .characters_and_codes
  250. .get_by_left(&'c')
  251. .unwrap()
  252. .to_string(),
  253. "11"
  254. );
  255. }
  256. #[test]
  257. fn encode_and_decode_text() {
  258. let mut chars_and_freq: HashMap<char, i32> = HashMap::new();
  259. chars_and_freq.insert('a', 25);
  260. chars_and_freq.insert('b', 14);
  261. chars_and_freq.insert('c', 5);
  262. let huff = Hufftree::new(chars_and_freq);
  263. let canonical = CanonicalHufftree::from_tree(huff);
  264. let input_text = String::from("aaabacacaaaabbbbbbbccccccccccccaacc");
  265. let encoded_text = canonical.encode_text(&input_text).unwrap();
  266. let decoded_text = canonical.decode_text(encoded_text).unwrap();
  267. assert_eq!(input_text, decoded_text);
  268. }
  269. }