|
|
@@ -1,33 +1,27 @@
|
|
|
+use anyhow::{anyhow, Context};
|
|
|
use crate::hufftree::canonical::CanonicalHufftree;
|
|
|
use bit_vec::BitVec;
|
|
|
use std::io::Read;
|
|
|
use std::io::Write;
|
|
|
|
|
|
-/* So what is our file format going to look like?
|
|
|
- * This is a toy project, but its usage should be real. I should be able to use
|
|
|
- * this to compress actual text files (hence the decision of using Rust chars)
|
|
|
- * with Unicode.
|
|
|
- *
|
|
|
- * Here is the idea:
|
|
|
- *
|
|
|
- * 4 bytes -> Length of the rest of file in bits.
|
|
|
- * n * 8 bytes -> CanonicalHufftree stored (8 bytes: (4)chars with (4)code_length)
|
|
|
- * 4 bytes -> Ones to mark as delimiter.
|
|
|
- * m bytes -> Compressed data, read only up to the bit specified earlier.
|
|
|
+/* Binary file format:
|
|
|
*
|
|
|
+ * 4 bytes -> Big-endian u32: bit length of everything that follows (tree entries + delimiter + encoded text), excluding padding bits.
|
|
|
+ * n * 8 bytes -> Tree entries: (4 bytes code_length BE) + (4 bytes UTF-8 char).
|
|
|
+ * 4 bytes -> Delimiter (0xFFFFFFFF).
|
|
|
+ * m bytes -> Huffman-encoded text, padded to the next byte boundary.
|
|
|
*/
|
|
|
pub fn store_tree_and_text<F: Write>(
|
|
|
tree: CanonicalHufftree,
|
|
|
writer: &mut F,
|
|
|
text: &str,
|
|
|
-) -> Result<(), String> {
|
|
|
+) -> Result<(), anyhow::Error> {
|
|
|
let mut buff = BitVec::new();
|
|
|
let mut character_buff: [u8; 4] = [0; 4];
|
|
|
let mut bit_length: u32 = 0;
|
|
|
|
|
|
- for (character, code_length) in tree.get_character_codes_for_storage() {
|
|
|
- let code_length = code_length.to_be_bytes();
|
|
|
- buff.append(&mut BitVec::from_bytes(&code_length));
|
|
|
+ for &(character, code_length) in tree.get_character_codes_for_storage() {
|
|
|
+ buff.append(&mut BitVec::from_bytes(&code_length.to_be_bytes()));
|
|
|
bit_length += 32;
|
|
|
|
|
|
character.encode_utf8(&mut character_buff);
|
|
|
@@ -40,115 +34,78 @@ pub fn store_tree_and_text<F: Write>(
|
|
|
buff.append(&mut BitVec::from_elem(32, true));
|
|
|
bit_length += 32;
|
|
|
|
|
|
- let encoded_text = tree.encode_text(text);
|
|
|
-
|
|
|
- let text_bits: u32 = encoded_text.len().try_into().unwrap();
|
|
|
-
|
|
|
- // println!("Bit length: {}, Text bits: {}.", bit_length, text_bits);
|
|
|
- bit_length += text_bits;
|
|
|
+ let encoded_text = tree.encode_text(text).map_err(|e| anyhow!(e))?;
|
|
|
+ bit_length += encoded_text.len() as u32;
|
|
|
|
|
|
- let buff = buff.to_bytes();
|
|
|
- // println!("Buffer when in bytes:{:?}", buff);
|
|
|
- // let buff_len: u32 = TryInto::<u32>::try_into(buff.len()).unwrap() * 8;
|
|
|
-
|
|
|
- let encoded_text = encoded_text.to_bytes();
|
|
|
- writer.write_all(&bit_length.to_be_bytes()).unwrap();
|
|
|
- writer.write_all(&buff).unwrap();
|
|
|
- writer.write_all(&encoded_text).unwrap();
|
|
|
+ writer.write_all(&bit_length.to_be_bytes())?;
|
|
|
+ writer.write_all(&buff.to_bytes())?;
|
|
|
+ writer.write_all(&encoded_text.to_bytes())?;
|
|
|
Ok(())
|
|
|
}
|
|
|
|
|
|
-pub fn read_tree_and_text<F: Read>(reader: &mut F) -> String {
|
|
|
- let mut length_of_file_in_bits: [u8; 4] = [0; 4];
|
|
|
-
|
|
|
- reader.read_exact(&mut length_of_file_in_bits).unwrap();
|
|
|
-
|
|
|
- let mut length_of_file_in_bits: u32 = four_b_to_u32(&length_of_file_in_bits);
|
|
|
+pub fn read_tree_and_text<F: Read>(reader: &mut F) -> Result<String, anyhow::Error> {
|
|
|
+ let mut length_buf: [u8; 4] = [0; 4];
|
|
|
+ reader
|
|
|
+ .read_exact(&mut length_buf)
|
|
|
+ .context("Could not read file length.")?;
|
|
|
+ let mut remaining_bits = u32::from_be_bytes(length_buf);
|
|
|
|
|
|
let mut working_vec: Vec<(char, u32)> = Vec::new();
|
|
|
-
|
|
|
- let mut char_and_code: [u8; 8] = [0; 8];
|
|
|
+ let mut entry: [u8; 8] = [0; 8];
|
|
|
reader
|
|
|
- .read_exact(&mut char_and_code)
|
|
|
- .expect("Could not read further.");
|
|
|
-
|
|
|
- let mut c: [u8; 4] = [0; 4];
|
|
|
- while char_and_code[0..4] != [255, 255, 255, 255] {
|
|
|
- // println!("Char and code (start):\n{:?}\n", char_and_code);
|
|
|
- c.clone_from_slice(&char_and_code[4..8]);
|
|
|
+ .read_exact(&mut entry)
|
|
|
+ .context("Could not read first tree entry.")?;
|
|
|
|
|
|
- // println!("Character: {:?}", c);
|
|
|
- let c: String = String::from_utf8(Vec::from(c)).expect("Corrupted data 🪳");
|
|
|
- // There should only be one character per 4 bytes.
|
|
|
- let c = c.chars().next().expect("Corrupted data 🪳");
|
|
|
- // let code = BitVec::from_bytes(&char_and_code[0..4]);
|
|
|
- let mut code = [0; 4];
|
|
|
- code.clone_from_slice(&char_and_code[0..4]);
|
|
|
- // println!("Character: {:?}", c);
|
|
|
+ while entry[0..4] != [255, 255, 255, 255] {
|
|
|
+ let code_length = u32::from_be_bytes(entry[0..4].try_into().unwrap());
|
|
|
|
|
|
- working_vec.push((c, four_b_to_u32(&code)));
|
|
|
+ let mut char_buf = [0u8; 4];
|
|
|
+ char_buf.copy_from_slice(&entry[4..8]);
|
|
|
+ let c = String::from_utf8(char_buf.to_vec())
|
|
|
+ .context("Corrupted tree entry: invalid UTF-8.")?;
|
|
|
+ let c = c
|
|
|
+ .chars()
|
|
|
+ .next()
|
|
|
+ .ok_or_else(|| anyhow!("Corrupted tree entry: empty character."))?;
|
|
|
|
|
|
- length_of_file_in_bits -= 64;
|
|
|
+ working_vec.push((c, code_length));
|
|
|
+ remaining_bits -= 64;
|
|
|
|
|
|
- // For small encodings
|
|
|
- if length_of_file_in_bits <= 64 {
|
|
|
+ if remaining_bits <= 64 {
|
|
|
break;
|
|
|
}
|
|
|
reader
|
|
|
- .read_exact(&mut char_and_code)
|
|
|
- .expect("Could not read further.");
|
|
|
- // println!("Char and code:\n{:?}\n", char_and_code);
|
|
|
- }
|
|
|
- // println!("Char and code:\n{:?}\n", char_and_code);
|
|
|
- length_of_file_in_bits -= 32;
|
|
|
-
|
|
|
- // println!("Length of file remaining: {}", length_of_file_in_bits);
|
|
|
-
|
|
|
- if length_of_file_in_bits <= 32 {
|
|
|
- let mut rest_of_binary = Vec::new();
|
|
|
- reader
|
|
|
- .read_to_end(&mut rest_of_binary)
|
|
|
- .expect("Could not read data to end.");
|
|
|
- // println!("Rest of binary: {:?}", rest_of_binary);
|
|
|
- let rest_of_binary = &rest_of_binary[4..];
|
|
|
- // println!("Rest of binary: {:?}", rest_of_binary);
|
|
|
-
|
|
|
- let mut bits = BitVec::from_bytes(rest_of_binary);
|
|
|
- bits.split_off(length_of_file_in_bits as usize);
|
|
|
- // println!("Bit vec: {:?}", bits);
|
|
|
-
|
|
|
- let can_tree = CanonicalHufftree::from_vec(working_vec);
|
|
|
- return can_tree.decode_text(bits).unwrap();
|
|
|
+ .read_exact(&mut entry)
|
|
|
+ .context("Could not read tree entry.")?;
|
|
|
}
|
|
|
|
|
|
- let mut encoded_text = BitVec::from_bytes(&char_and_code[4..8]);
|
|
|
- let mut rest_of_encoded_text = Vec::new();
|
|
|
- reader
|
|
|
- .read_to_end(&mut rest_of_encoded_text)
|
|
|
- .expect("Could not read till EOF.");
|
|
|
-
|
|
|
- let mut rest_of_encoded_text = BitVec::from_bytes(&rest_of_encoded_text);
|
|
|
- // println!("Bit vec: {:?}", rest_of_encoded_text);
|
|
|
- encoded_text.append(&mut rest_of_encoded_text);
|
|
|
- encoded_text.split_off(length_of_file_in_bits as usize);
|
|
|
+ remaining_bits -= 32; // delimiter
|
|
|
|
|
|
let can_tree = CanonicalHufftree::from_vec(working_vec);
|
|
|
- can_tree.decode_text(encoded_text).unwrap()
|
|
|
-}
|
|
|
|
|
|
-fn four_b_to_u32(b: &[u8; 4]) -> u32 {
|
|
|
- let mut result: u32 = 0;
|
|
|
+ if remaining_bits <= 32 {
|
|
|
+ let mut rest = Vec::new();
|
|
|
+ reader
|
|
|
+ .read_to_end(&mut rest)
|
|
|
+ .context("Could not read encoded data.")?;
|
|
|
+ let rest = &rest[4..]; // skip the 4-byte delimiter, which is still unread on this path (the loop exited via `break`)
|
|
|
|
|
|
- for (i, bt) in b.iter().enumerate() {
|
|
|
- let bt32 = *bt as u32;
|
|
|
- result += bt32;
|
|
|
+ let mut bits = BitVec::from_bytes(rest);
|
|
|
+ bits.split_off(remaining_bits as usize);
|
|
|
|
|
|
- if i != 3 {
|
|
|
- result <<= 8;
|
|
|
- }
|
|
|
+ return can_tree.decode_text(bits).map_err(|e| anyhow!(e));
|
|
|
}
|
|
|
|
|
|
- result
|
|
|
+ let mut encoded_text = BitVec::from_bytes(&entry[4..8]);
|
|
|
+ let mut rest = Vec::new();
|
|
|
+ reader
|
|
|
+ .read_to_end(&mut rest)
|
|
|
+ .context("Could not read encoded data.")?;
|
|
|
+ let mut rest_bits = BitVec::from_bytes(&rest);
|
|
|
+ encoded_text.append(&mut rest_bits);
|
|
|
+ encoded_text.split_off(remaining_bits as usize);
|
|
|
+
|
|
|
+ can_tree.decode_text(encoded_text).map_err(|e| anyhow!(e))
|
|
|
}
|
|
|
|
|
|
#[cfg(test)]
|
|
|
@@ -173,27 +130,9 @@ mod test {
|
|
|
let mut virtual_buffer = Vec::new();
|
|
|
store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
|
|
|
|
|
|
- // println!("Buffer:{:?}", virtual_buffer);
|
|
|
- assert_eq!(&virtual_buffer[0..4], &[0, 0, 0, 233]); // Length of tree + encoded text.
|
|
|
- // 0,0,0,1, // Code length of 'a'
|
|
|
- // 97,0,0,0, // 'a'
|
|
|
- // 0,0,0,2, // Code length of 'b'
|
|
|
- // 98,0,0,0, // 'b'
|
|
|
- // 0,0,0,2, // Code length of 'c'
|
|
|
- // 99,0,0,0, // 'c'
|
|
|
- // 255,255,255,255, // Delimiter
|
|
|
- // 21, 128, // Encoded text.
|
|
|
- // ]
|
|
|
+ assert_eq!(&virtual_buffer[0..4], &[0, 0, 0, 233]);
|
|
|
let size = virtual_buffer.len();
|
|
|
assert_eq!(&virtual_buffer[(size - 4)..size], &[255, 255, 21, 128]);
|
|
|
- // )
|
|
|
- }
|
|
|
-
|
|
|
- #[test]
|
|
|
- fn convert_array_to_u32() {
|
|
|
- let two_hundred_fifty_seven: [u8; 4] = [0, 0, 1, 1];
|
|
|
- let as_num = four_b_to_u32(&two_hundred_fifty_seven);
|
|
|
- assert_eq!(as_num, 257u32);
|
|
|
}
|
|
|
|
|
|
#[test]
|
|
|
@@ -211,12 +150,9 @@ mod test {
|
|
|
let mut virtual_buffer = Vec::new();
|
|
|
store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
|
|
|
|
|
|
- // println!("Virtual buffer: {:?}", virtual_buffer);
|
|
|
-
|
|
|
- let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
|
|
|
+ let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
|
|
|
|
|
|
assert_eq!(decoded_text, input_text);
|
|
|
- // println!("Decoded text: {}\nInput text:{}", decoded_text, input_text);
|
|
|
}
|
|
|
|
|
|
#[test]
|
|
|
@@ -234,7 +170,7 @@ mod test {
|
|
|
let mut virtual_buffer = Vec::new();
|
|
|
store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
|
|
|
|
|
|
- let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
|
|
|
+ let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
|
|
|
|
|
|
assert_eq!(decoded_text, input_text);
|
|
|
}
|
|
|
@@ -254,11 +190,8 @@ mod test {
|
|
|
let mut virtual_buffer = Vec::new();
|
|
|
store_tree_and_text(canonical, &mut virtual_buffer, &input_text).unwrap();
|
|
|
|
|
|
- // println!("Virtual buffer: {:?}", virtual_buffer);
|
|
|
-
|
|
|
- let decoded_text = read_tree_and_text(&mut &virtual_buffer[0..virtual_buffer.len()]);
|
|
|
+ let decoded_text = read_tree_and_text(&mut &virtual_buffer[..]).unwrap();
|
|
|
|
|
|
assert_eq!(decoded_text, input_text);
|
|
|
- // println!("Decoded text: {}\nInput text:{}", decoded_text, input_text);
|
|
|
}
|
|
|
}
|