Просмотр исходного кода

Improve robustness and efficiency across encode/decode pipeline

- encode_text: pre-allocate with with_capacity, iterate bits directly
  to avoid per-character clone, return Result instead of panicking on
  unknown characters
- decode_text: annotate error lifetime as 'static (errors are literals)
- get_character_codes_for_storage: return &[(char, u32)] instead of
  cloning the Vec
- read_tree_and_text: return Result<String, anyhow::Error>, replacing
  all panicking unwraps/expects with propagated errors
- store_tree_and_text: return Result<(), anyhow::Error>, propagate
  encode_text errors
- Replace hand-rolled four_b_to_u32 with u32::from_be_bytes
- Remove commented-out debug println!s and dead code
AvariceLHubris 3 недели назад
Родитель
Commit
80d275f6c3
3 изменённых файла: 81 добавлено и 145 удалено
  1. 18 15
      src/hufftree/canonical.rs
  2. 1 1
      src/main.rs
  3. 62 129
      src/storage.rs

+ 18 - 15
src/hufftree/canonical.rs

@@ -165,20 +165,27 @@ impl CanonicalHufftree {
         }
     }
 
-    // TODO: Optimise this (the vector copying is probably extremely inefficient)
-    pub fn encode_text(&self, text: &str) -> BitVec {
-        let mut converted_text = BitVec::new();
+    pub fn encode_text(&self, text: &str) -> Result<BitVec, &'static str> {
+        let total_bits = text
+            .chars()
+            .map(|c| self.characters_and_codes.get_by_left(&c).map_or(0, |code| code.len()))
+            .sum();
+
+        let mut converted_text = BitVec::with_capacity(total_bits);
 
         for character in text.chars() {
-            let temp_code = self.characters_and_codes.get_by_left(&character).unwrap();
-            let mut temp_code = temp_code.clone();
-            converted_text.append(&mut temp_code);
+            let code = self.characters_and_codes
+                .get_by_left(&character)
+                .ok_or("Character not found in encoding table.")?;
+            for bit in code.iter() {
+                converted_text.push(bit);
+            }
         }
 
-        converted_text
+        Ok(converted_text)
     }
 
-    pub fn decode_text(&self, text: BitVec) -> Result<String, &str> {
+    pub fn decode_text(&self, text: BitVec) -> Result<String, &'static str> {
         let mut decoded_text = String::new();
 
         // So that popping bits removes them from the "start" of the text.
@@ -203,12 +210,8 @@ impl CanonicalHufftree {
         }
     }
 
-    // pub fn get_character_codes(&self) -> BiMap<char, BitVec> {
-    //     self.characters_and_codes.clone()
-    // }
-
-    pub fn get_character_codes_for_storage(&self) -> Vec<(char, u32)> {
-        self.storage_char_codes.clone()
+    pub fn get_character_codes_for_storage(&self) -> &[(char, u32)] {
+        &self.storage_char_codes
     }
 }
 
@@ -312,7 +315,7 @@ mod canonical_tests {
         let canonical = CanonicalHufftree::from_tree(huff);
 
         let input_text = String::from("aaabacacaaaabbbbbbbccccccccccccaacc");
-        let encoded_text = canonical.encode_text(&input_text);
+        let encoded_text = canonical.encode_text(&input_text).unwrap();
 
         let decoded_text = canonical.decode_text(encoded_text).unwrap();
 

+ 1 - 1
src/main.rs

@@ -98,7 +98,7 @@ fn main() -> Result<(), anyhow::Error> {
     match mode {
         cli::Mode::X => {
             status!("Decoding text...");
-            let decoded_text = huffman::storage::read_tree_and_text(&mut &input_bytes[..]);
+            let decoded_text = huffman::storage::read_tree_and_text(&mut &input_bytes[..])?;
             status!("Decoded!");
 
             writer

+ 62 - 129
src/storage.rs

@@ -1,33 +1,27 @@
+use anyhow::{anyhow, Context};
 use crate::hufftree::canonical::CanonicalHufftree;
 use bit_vec::BitVec;
 use std::io::Read;
 use std::io::Write;
 
-/* So what is our file format going to look like?
- * This is a toy project, but its usage should be real. I should be able to use
- * this to compress actual text files (hence the decision of using Rust chars)
- * with Unicode.
- *
- * Here is the idea:
- *
- * 4 bytes     -> Length of the rest of file in bits.
- * n * 8 bytes -> CanonicalHufftree stored (8 bytes: (4)chars with (4)code_length)
- * 4 bytes     -> Ones to mark as delimiter.
- * m bytes     -> Compressed data, read only up to the bit specified earlier.
+/* Binary file format:
  *
+ * 4 bytes     -> Total bit length of the remaining data.
+ * n * 8 bytes -> Tree entries: (4 bytes code_length BE) + (4 bytes UTF-8 char).
+ * 4 bytes     -> Delimiter (0xFFFFFFFF).
+ * m bytes     -> Huffman-encoded text, padded to the next byte boundary.
  */
 pub fn store_tree_and_text<F: Write>(
     tree: CanonicalHufftree,
     writer: &mut F,
     text: &str,
-) -> Result<(), String> {
+) -> Result<(), anyhow::Error> {
     let mut buff = BitVec::new();
     let mut character_buff: [u8; 4] = [0; 4];
     let mut bit_length: u32 = 0;
 
-    for (character, code_length) in tree.get_character_codes_for_storage() {
-        let code_length = code_length.to_be_bytes();
-        buff.append(&mut BitVec::from_bytes(&code_length));
+    for &(character, code_length) in tree.get_character_codes_for_storage() {
+        buff.append(&mut BitVec::from_bytes(&code_length.to_be_bytes()));
         bit_length += 32;
 
         character.encode_utf8(&mut character_buff);
@@ -40,115 +34,78 @@ pub fn store_tree_and_text<F: Write>(
     buff.append(&mut BitVec::from_elem(32, true));
     bit_length += 32;
 
-    let encoded_text = tree.encode_text(text);
-
-    let text_bits: u32 = encoded_text.len().try_into().unwrap();
-
-    // println!("Bit length: {}, Text bits: {}.", bit_length, text_bits);
-    bit_length += text_bits;
+    let encoded_text = tree.encode_text(text).map_err(|e| anyhow!(e))?;
+    bit_length += encoded_text.len() as u32;
 
-    let buff = buff.to_bytes();
-    // println!("Buffer when in bytes:{:?}", buff);
-    // let buff_len: u32 = TryInto::<u32>::try_into(buff.len()).unwrap() * 8;
-
-    let encoded_text = encoded_text.to_bytes();
-    writer.write_all(&bit_length.to_be_bytes()).unwrap();
-    writer.write_all(&buff).unwrap();
-    writer.write_all(&encoded_text).unwrap();
+    writer.write_all(&bit_length.to_be_bytes())?;
+    writer.write_all(&buff.to_bytes())?;
+    writer.write_all(&encoded_text.to_bytes())?;
     Ok(())
 }
 
-pub fn read_tree_and_text<F: Read>(reader: &mut F) -> String {
-    let mut length_of_file_in_bits: [u8; 4] = [0; 4];
-
-    reader.read_exact(&mut length_of_file_in_bits).unwrap();
-
-    let mut length_of_file_in_bits: u32 = four_b_to_u32(&length_of_file_in_bits);
+pub fn read_tree_and_text<F: Read>(reader: &mut F) -> Result<String, anyhow::Error> {
+    let mut length_buf: [u8; 4] = [0; 4];
+    reader
+        .read_exact(&mut length_buf)
+        .context("Could not read file length.")?;
+    let mut remaining_bits = u32::from_be_bytes(length_buf);
 
     let mut working_vec: Vec<(char, u32)> = Vec::new();
-
-    let mut char_and_code: [u8; 8] = [0; 8];
+    let mut entry: [u8; 8] = [0; 8];
     reader
-        .read_exact(&mut char_and_code)
-        .expect("Could not read further.");
-
-    let mut c: [u8; 4] = [0; 4];
-    while char_and_code[0..4] != [255, 255, 255, 255] {
-        // println!("Char and code (start):\n{:?}\n", char_and_code);
-        c.clone_from_slice(&char_and_code[4..8]);
+        .read_exact(&mut entry)
+        .context("Could not read first tree entry.")?;
 
-        // println!("Character: {:?}", c);
-        let c: String = String::from_utf8(Vec::from(c)).expect("Corrupted data 🪳");
-        // There should only be one character per 4 bytes.
-        let c = c.chars().next().expect("Corrupted data 🪳");
-        // let code = BitVec::from_bytes(&char_and_code[0..4]);
-        let mut code = [0; 4];
-        code.clone_from_slice(&char_and_code[0..4]);
-        // println!("Character: {:?}", c);
+    while entry[0..4] != [255, 255, 255, 255] {
+        let code_length = u32::from_be_bytes(entry[0..4].try_into().unwrap());
 
-        working_vec.push((c, four_b_to_u32(&code)));
+        let mut char_buf = [0u8; 4];
+        char_buf.copy_from_slice(&entry[4..8]);
+        let c = String::from_utf8(char_buf.to_vec())
+            .context("Corrupted tree entry: invalid UTF-8.")?;
+        let c = c
+            .chars()
+            .next()
+            .ok_or_else(|| anyhow!("Corrupted tree entry: empty character."))?;
 
-        length_of_file_in_bits -= 64;
+        working_vec.push((c, code_length));
+        remaining_bits -= 64;
 
-        // For small encodings
-        if length_of_file_in_bits <= 64 {
+        if remaining_bits <= 64 {
             break;
         }
         reader
-            .read_exact(&mut char_and_code)
-            .expect("Could not read further.");
-        // println!("Char and code:\n{:?}\n", char_and_code);
-    }
-    // println!("Char and code:\n{:?}\n", char_and_code);
-    length_of_file_in_bits -= 32;
-
-    // println!("Length of file remaining: {}", length_of_file_in_bits);
-
-    if length_of_file_in_bits <= 32 {
-        let mut rest_of_binary = Vec::new();
-        reader
-            .read_to_end(&mut rest_of_binary)
-            .expect("Could not read data to end.");
-        // println!("Rest of binary: {:?}", rest_of_binary);
-        let rest_of_binary = &rest_of_binary[4..];
-        // println!("Rest of binary: {:?}", rest_of_binary);
-
-        let mut bits = BitVec::from_bytes(rest_of_binary);
-        bits.split_off(length_of_file_in_bits as usize);
-        // println!("Bit vec: {:?}", bits);
-
-        let can_tree = CanonicalHufftree::from_vec(working_vec);
-        return can_tree.decode_text(bits).unwrap();
+            .read_exact(&mut entry)
+            .context("Could not read tree entry.")?;
     }
 
-    let mut encoded_text = BitVec::from_bytes(&char_and_code[4..8]);
-    let mut rest_of_encoded_text = Vec::new();
-    reader
-        .read_to_end(&mut rest_of_encoded_text)
-        .expect("Could not read till EOF.");
-
-    let mut rest_of_encoded_text = BitVec::from_bytes(&rest_of_encoded_text);
-    // println!("Bit vec: {:?}", rest_of_encoded_text);
-    encoded_text.append(&mut rest_of_encoded_text);
-    encoded_text.split_off(length_of_file_in_bits as usize);
+    remaining_bits -= 32; // delimiter
 
     let can_tree = CanonicalHufftree::from_vec(working_vec);
-    can_tree.decode_text(encoded_text).unwrap()
-}
 
-fn four_b_to_u32(b: &[u8; 4]) -> u32 {
-    let mut result: u32 = 0;
+    if remaining_bits <= 32 {
+        let mut rest = Vec::new();
+        reader
+            .read_to_end(&mut rest)
+            .context("Could not read encoded data.")?;
+        let rest = &rest[4..]; // skip the portion of the delimiter that was in `entry`
 
-    for (i, bt) in b.iter().enumerate() {
-        let bt32 = *bt as u32;
-        result += bt32;
+        let mut bits = BitVec::from_bytes(rest);
+        bits.split_off(remaining_bits as usize);
 
-        if i != 3 {
-            result <<= 8;
-        }
+        return can_tree.decode_text(bits).map_err(|e| anyhow!(e));
     }
 
-    result
+    let mut encoded_text = BitVec::from_bytes(&entry[4..8]);
+    let mut rest = Vec::new();
+    reader
+        .read_to_end(&mut rest)
+        .context("Could not read encoded data.")?;
+    let mut rest_bits = BitVec::from_bytes(&rest);
+    encoded_text.append(&mut rest_bits);
+    encoded_text.split_off(remaining_bits as usize);
+
+    can_tree.decode_text(encoded_text).map_err(|e| anyhow!(e))
 }
 
 #[cfg(test)]
@@ -173,27 +130,9 @@ mod test {
         let mut virtual_buffer = Vec::new();
         store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
 
-        // println!("Buffer:{:?}", virtual_buffer);
-        assert_eq!(&virtual_buffer[0..4], &[0, 0, 0, 233]); // Length of tree + encoded text.
-                                                            // 0,0,0,1, //  Code length of 'a'
-                                                            // 97,0,0,0, // 'a'
-                                                            // 0,0,0,2, //  Code length of 'b'
-                                                            // 98,0,0,0, // 'b'
-                                                            // 0,0,0,2, //  Code length of 'c'
-                                                            // 99,0,0,0, // 'c'
-                                                            // 255,255,255,255, // Delimiter
-                                                            // 21, 128, // Encoded text.
-                                                            // ]
+        assert_eq!(&virtual_buffer[0..4], &[0, 0, 0, 233]);
         let size = virtual_buffer.len();
         assert_eq!(&virtual_buffer[(size - 4)..size], &[255, 255, 21, 128]);
-        // )
-    }
-
-    #[test]
-    fn convert_array_to_u32() {
-        let two_hundred_fifty_seven: [u8; 4] = [0, 0, 1, 1];
-        let as_num = four_b_to_u32(&two_hundred_fifty_seven);
-        assert_eq!(as_num, 257u32);
     }
 
     #[test]
@@ -211,12 +150,9 @@ mod test {
         let mut virtual_buffer = Vec::new();
         store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
 
-        // println!("Virtual buffer: {:?}", virtual_buffer);
-
-        let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
+        let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
 
         assert_eq!(decoded_text, input_text);
-        // println!("Decoded text: {}\nInput text:{}", decoded_text, input_text);
     }
 
     #[test]
@@ -234,7 +170,7 @@ mod test {
         let mut virtual_buffer = Vec::new();
         store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
 
-        let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
+        let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
 
         assert_eq!(decoded_text, input_text);
     }
@@ -254,11 +190,8 @@ mod test {
         let mut virtual_buffer = Vec::new();
         store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
 
-        // println!("Virtual buffer: {:?}", virtual_buffer);
-
-        let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
+        let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
 
         assert_eq!(decoded_text, input_text);
-        // println!("Decoded text: {}\nInput text:{}", decoded_text, input_text);
     }
 }