From 8945366082c7904e7188c0b4c0caad7592236e35 Mon Sep 17 00:00:00 2001
From: yukirij
Date: Fri, 8 Nov 2024 14:46:47 -0800
Subject: [PATCH] Initial implementation of TrieFile.

---
 src/bin/test.rs     |  31 +++-
 src/lib.rs          |   2 +-
 src/triefile/mod.rs | 364 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 394 insertions(+), 3 deletions(-)
 create mode 100644 src/triefile/mod.rs

diff --git a/src/bin/test.rs b/src/bin/test.rs
index b5e7585..582fbd3 100644
--- a/src/bin/test.rs
+++ b/src/bin/test.rs
@@ -1,8 +1,11 @@
-use storage::BlockFile;
+//use storage::BlockFile;
+use storage::TrieFile;
 
 fn main()
 {
-    if let Ok(mut bf) = BlockFile::<128>::open("data/cache.bin") {
+    std::fs::create_dir_all("data").ok();
+
+    /*if let Ok(mut bf) = BlockFile::<128>::open("data/cache_data.bin") {
 
         for i in 0..760 {
             if let Ok(id) = bf.insert(format!("Hello, world! {}. This is long text to increase the block size to sufficient length to roll over into a second block when using smaller block sizes.", i).as_bytes()) {
@@ -13,5 +16,29 @@ fn main()
         }
     } else {
         println!("Failed to open.");
+    }*/
+
+    if let Ok(mut tf) = TrieFile::<8>::open("data/cache_index.bin") {
+
+        for s in [
+            "Hello",
+            "Hello!",
+            "Goodbye",
+            "Greatings",
+            "Good day",
+            "Regards",
+        ] {
+            println!("# insert {}", s);
+            if tf.set(s.as_bytes(), s.as_bytes()).is_err() {
+                println!("Failed to insert '{}'.", s);
+            }
+
+            if let Some(data) = tf.get(s.as_bytes()).unwrap() {
+                println!("found '{}'.", String::from_utf8(data).unwrap());
+            }
+        }
+
+    } else {
+        println!("Failed to open index.");
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index fa1f756..372e333 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,4 @@
 #![allow(dead_code)]
 
 mod blockfile; pub use blockfile::BlockFile;
-//mod triefile; pub use triefile::TrieFile;
+mod triefile; pub use triefile::TrieFile;
diff --git a/src/triefile/mod.rs b/src/triefile/mod.rs
new file mode 100644
index 0000000..72d6545
--- /dev/null
+++ b/src/triefile/mod.rs
@@ -0,0 +1,364 @@
+/*
+** File-based trie.
+**
+** [Record]
+**   Each node occupies one fixed-size block of `24 + Z` bytes:
+**
+**     flags   1 byte    bit 7: node holds data; bits 0-3: segment length
+**     bytes   15 bytes  key segment, zero-padded
+**     next    4 bytes   block index of the next sibling (0 = none)
+**     child   4 bytes   block index of the first child (0 = none)
+**     data    Z bytes   stored value, zero-padded
+**
+**   Block 0 is the root node, so 0 doubles as the "no link" marker.
+*/
+
+use pack::prelude::Pack;
+
+use std::{
+    fs::File,
+    io::{Read, Seek, SeekFrom, Write},
+    path::Path,
+};
+
+/// Maximum number of key bytes held by a single node.
+const SEGMENT_LEN:usize = 15;
+
+/// One fixed-size trie record: a key segment, sibling/child links, and an optional value.
+struct Node<const Z:usize> {
+    length:usize,
+    bytes:[u8; SEGMENT_LEN],
+    next:u32,
+    child:u32,
+    has_data:bool,
+    data:[u8; Z],
+}
+impl<const Z:usize> Node<Z> {
+    /// Creates an empty node with no key bytes, links, or value.
+    fn new() -> Self
+    {
+        Self {
+            length:0,
+            bytes:[0; SEGMENT_LEN],
+            next:0,
+            child:0,
+            has_data:false,
+            data:[0; Z],
+        }
+    }
+
+    /// Stores `data` as this node's value, truncated or zero-padded to `Z` bytes.
+    fn store(&mut self, data:&[u8])
+    {
+        let count = Z.min(data.len());
+        self.has_data = true;
+        self.data = [0; Z];
+        self.data[..count].copy_from_slice(&data[..count]);
+    }
+
+    /// Serializes the node into the on-disk block layout.
+    fn encode(&self) -> Vec<u8>
+    {
+        let mut flags = 0u8;
+        flags |= (self.has_data as u8) << 7;
+        flags |= (self.length as u8) & 0x0F;
+
+        [
+            flags.pack(),
+            self.bytes.to_vec(),
+            self.next.pack(),
+            self.child.pack(),
+            self.data.to_vec(),
+        ].concat()
+    }
+
+    /// Reads a node from `data`, advancing `index` past the consumed bytes.
+    ///
+    /// Fails if `data` is not exactly one block long or a field cannot be read.
+    fn decode(&mut self, data:&[u8], index:&mut usize) -> Result<(),()>
+    {
+        if data.len() != Self::len() {
+            return Err(());
+        }
+
+        let flags = u8::unpack(data, index)?;
+        self.length = (flags & 0x0F) as usize;
+        self.has_data = (flags & 0x80) != 0;
+
+        let segment = data.get(*index..*index + SEGMENT_LEN).ok_or(())?;
+        self.bytes.copy_from_slice(segment);
+        *index += SEGMENT_LEN;
+
+        self.next = u32::unpack(data, index)?;
+        self.child = u32::unpack(data, index)?;
+
+        let value = data.get(*index..*index + Z).ok_or(())?;
+        self.data.copy_from_slice(value);
+        *index += Z;
+
+        Ok(())
+    }
+
+    /// Size in bytes of one encoded node.
+    const fn len() -> usize
+    {
+        24 + Z
+    }
+}
+
+/// Trie stored in a file of fixed-size node blocks, mapping byte keys to `Z`-byte values.
+pub struct TrieFile<const Z:usize> {
+    file:File,
+}
+impl<const Z:usize> TrieFile<Z> {
+    /// Opens the trie file at `path` for reading and writing, creating it if it does not exist.
+    pub fn open<P:AsRef<Path>>(path:P) -> Result<Self, std::io::Error>
+    {
+        let file = File::options()
+            .create(true)
+            .read(true)
+            .write(true)
+            .open(path)?;
+
+        Ok(Self { file })
+    }
+
+    /// Associates `data` with `key`, replacing any value already stored for that key.
+    ///
+    /// Only the first `Z` bytes of `data` are kept; shorter values are zero-padded.
+    /// An empty `key` stores no value.
+    pub fn set(&mut self, key:&[u8], data:&[u8]) -> Result<(),std::io::Error>
+    {
+        let mut node = Node::<Z>::new();
+
+        let mut node_index = 0;
+        let mut key_index = 0;
+
+        // Allocate first chain of nodes if none exist.
+        if self.block_count()? == 0 {
+            self.trailing_nodes(key, &mut key_index, data)?;
+            return Ok(());
+        }
+
+        // Traverse nodes until key is found.
+        while key_index < key.len() {
+            self.read_node(node_index, &mut node)?;
+
+            // If node does not share prefix with key, move to or create the next sibling.
+            if node.bytes[0] != key[key_index] {
+                if node.next != 0 {
+                    node_index = node.next;
+                } else {
+                    // Allocate and initialize subsequent nodes until key is resolved.
+                    node.next = self.trailing_nodes(key, &mut key_index, data)?;
+                    self.write_node(node_index, &node)?;
+                }
+                continue;
+            }
+
+            let prefix_len = Self::shared_prefix(&node, &key[key_index..]);
+            key_index += prefix_len;
+
+            if prefix_len == node.length {
+                if key_index == key.len() {
+                    // Copy data to node, mark as occupied, and write to file.
+                    node.store(data);
+                    self.write_node(node_index, &node)?;
+                } else if node.child != 0 {
+                    // Continue to child node.
+                    node_index = node.child;
+                } else {
+                    // Create new child node chain if none exists.
+                    node.child = self.trailing_nodes(key, &mut key_index, data)?;
+                    self.write_node(node_index, &node)?;
+                }
+            } else {
+                // Split node into a parent holding the shared prefix and a child holding
+                // the rest of the segment, preserving the parent block id.
+                let suffix_len = node.length - prefix_len;
+
+                // Only the `length` bytes in use move down; taking the whole buffer
+                // would give the child a length that covers zero padding.
+                let mut child_node = Node::<Z>::new();
+                child_node.length = suffix_len;
+                child_node.bytes[..suffix_len].copy_from_slice(&node.bytes[prefix_len..node.length]);
+                child_node.child = node.child;
+                child_node.has_data = node.has_data;
+                child_node.data = node.data;
+
+                let child_index = self.allocate()?;
+
+                node.length = prefix_len;
+                node.bytes[prefix_len..].fill(0);
+                node.child = child_index;
+                node.has_data = false;
+                node.data.fill(0);
+
+                if key_index < key.len() {
+                    // Remaining key bytes become a new sibling of the split-off child.
+                    child_node.next = self.trailing_nodes(key, &mut key_index, data)?;
+                } else {
+                    // Key ends at the split point, so the parent holds the value.
+                    node.store(data);
+                }
+
+                self.write_node(node_index, &node)?;
+                self.write_node(child_index, &child_node)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Returns the value stored for `key`, or `None` if the key has no value.
+    ///
+    /// The returned vector is always `Z` bytes long, including any zero padding.
+    pub fn get(&self, key:&[u8]) -> Result<Option<Vec<u8>>, std::io::Error>
+    {
+        if self.block_count()? == 0 {
+            return Ok(None);
+        }
+
+        let mut node = Node::<Z>::new();
+
+        let mut node_index = 0;
+        let mut key_index = 0;
+
+        // Traverse nodes until key is found.
+        while key_index < key.len() {
+            self.read_node(node_index, &mut node)?;
+
+            // If node does not share prefix with key, move to the next sibling.
+            if node.bytes[0] != key[key_index] {
+                if node.next == 0 {
+                    return Ok(None);
+                }
+                node_index = node.next;
+                continue;
+            }
+
+            // A partially matched segment means the key diverges inside this node.
+            let prefix_len = Self::shared_prefix(&node, &key[key_index..]);
+            if prefix_len != node.length {
+                return Ok(None);
+            }
+            key_index += prefix_len;
+
+            if key_index == key.len() {
+                // Nodes created by a split exist for keys that were never set.
+                return Ok(node.has_data.then(|| node.data.to_vec()));
+            }
+
+            if node.child == 0 {
+                return Ok(None);
+            }
+            node_index = node.child;
+        }
+
+        Ok(None)
+    }
+
+    /*pub fn unset(&self, _key:&[u8]) -> Result<(), std::io::Error>
+    {
+        Ok(())
+    }*/
+
+    /// Counts the leading bytes of `rest` that match the key segment stored in `node`.
+    fn shared_prefix(node:&Node<Z>, rest:&[u8]) -> usize
+    {
+        node.bytes[..node.length]
+            .iter()
+            .zip(rest)
+            .take_while(|(a, b)| a == b)
+            .count()
+    }
+
+    /// Writes the rest of `key`, starting at `key_index`, as a new chain of nodes and
+    /// stores `data` in the last one. Returns the block index of the first node.
+    fn trailing_nodes(&mut self, key:&[u8], key_index:&mut usize, data:&[u8]) -> Result<u32, std::io::Error>
+    {
+        let starting_node = self.allocate()?;
+        let mut node_index = starting_node;
+
+        while *key_index < key.len() {
+            let mut node = Node::<Z>::new();
+
+            // Copy key bytes to node.
+            let count = SEGMENT_LEN.min(key.len() - *key_index);
+            node.bytes[..count].copy_from_slice(&key[*key_index..*key_index + count]);
+            node.length = count;
+            *key_index += count;
+
+            if *key_index < key.len() {
+                // Allocate child node if key bytes remain.
+                node.child = self.allocate()?;
+            } else {
+                // Otherwise, write data to node.
+                node.store(data);
+            }
+
+            self.write_node(node_index, &node)?;
+            node_index = node.child;
+        }
+
+        Ok(starting_node)
+    }
+
+    /// Reads and decodes the node stored in block `index`.
+    fn read_node(&self, index:u32, node:&mut Node<Z>) -> Result<(), std::io::Error>
+    {
+        // `Read` and `Seek` are implemented for `&File`, so no duplicate handle is needed.
+        let mut file = &self.file;
+
+        let mut data = vec![0u8; Self::block_size()];
+
+        file.seek(SeekFrom::Start(Self::block_offset(index)))?;
+        file.read_exact(&mut data)?;
+
+        node.decode(&data, &mut 0).map_err(|_| std::io::Error::other("failed to decode node block"))
+    }
+
+    /// Encodes `node` and writes it to block `index`.
+    fn write_node(&mut self, index:u32, node:&Node<Z>) -> Result<(), std::io::Error>
+    {
+        self.file.seek(SeekFrom::Start(Self::block_offset(index)))?;
+
+        // `write` may stop after a partial write; `write_all` writes the whole block.
+        self.file.write_all(&node.encode())
+    }
+
+    /// Appends a zeroed block to the file and returns its index.
+    fn allocate(&mut self) -> Result<u32, std::io::Error>
+    {
+        let end = self.file.seek(SeekFrom::End(0))?;
+        let block_id = Self::block_index(end)?;
+        self.file.write_all(&vec![0u8; Self::block_size()])?;
+        Ok(block_id)
+    }
+
+    /// Returns the number of whole blocks currently in the file.
+    fn block_count(&self) -> Result<u32, std::io::Error>
+    {
+        let mut file = &self.file;
+        Self::block_index(file.seek(SeekFrom::End(0))?)
+    }
+
+    /// Converts a byte offset into a block index, failing if it does not fit in `u32`.
+    fn block_index(offset:u64) -> Result<u32, std::io::Error>
+    {
+        u32::try_from(offset / Self::block_size() as u64)
+            .map_err(|_| std::io::Error::other("trie file exceeds the addressable block range"))
+    }
+
+    /// Byte offset of block `index` within the file.
+    const fn block_offset(index:u32) -> u64
+    {
+        Self::block_size() as u64 * index as u64
+    }
+
+    /// Size in bytes of one block.
+    const fn block_size() -> usize
+    {
+        Node::<Z>::len()
+    }
+}