Initial implementation of TrieFile.

This commit is contained in:
yukirij 2024-11-08 14:46:47 -08:00
parent 4ad9095296
commit 8945366082
3 changed files with 431 additions and 3 deletions

View File

@ -1,8 +1,11 @@
use storage::BlockFile; //use storage::BlockFile;
use storage::TrieFile;
fn main() fn main()
{ {
if let Ok(mut bf) = BlockFile::<128>::open("data/cache.bin") { std::fs::create_dir_all("data").ok();
/*if let Ok(mut bf) = BlockFile::<128>::open("data/cache_data.bin") {
for i in 0..760 { for i in 0..760 {
if let Ok(id) = bf.insert(format!("Hello, world! {}. This is long text to increase the block size to sufficient length to roll over into a second block when using smaller block sizes.", i).as_bytes()) { if let Ok(id) = bf.insert(format!("Hello, world! {}. This is long text to increase the block size to sufficient length to roll over into a second block when using smaller block sizes.", i).as_bytes()) {
@ -13,5 +16,29 @@ fn main()
} }
} else { } else {
println!("Failed to open."); println!("Failed to open.");
}*/
if let Ok(mut tf) = TrieFile::<8>::open("data/cache_index.bin") {
for s in [
"Hello",
"Hello!",
"Goodbye",
"Greatings",
"Good day",
"Regards",
] {
println!("# insert {}", s);
if tf.set(s.as_bytes(), s.as_bytes()).is_err() {
println!("Failed to insert '{}'.", s);
}
if let Some(data) = tf.get(s.as_bytes()).unwrap() {
println!("found '{}'.", String::from_utf8(data).unwrap());
}
}
} else {
println!("Failed to open index.");
} }
} }

View File

@ -1,4 +1,4 @@
#![allow(dead_code)] #![allow(dead_code)]
mod blockfile; pub use blockfile::BlockFile; mod blockfile; pub use blockfile::BlockFile;
//mod triefile; pub use triefile::TrieFile; mod triefile; pub use triefile::TrieFile;

401
src/triefile/mod.rs Normal file
View File

@ -0,0 +1,401 @@
/*
** File-based trie.
**
** [Record]
** <Flags/Length:1> <Prefix:15>
** <Next:4> <Child:4>
** <Data:Z>
*/
use pack::prelude::Pack;
use std::{
fs::File,
io::{Read, Seek, SeekFrom, Write},
path::Path,
};
/// In-memory form of one trie record.
///
/// `Z` is the fixed number of payload bytes every record carries.
struct Node<const Z: usize> {
    /// Number of leading bytes of `bytes` that belong to this node's key fragment.
    length: usize,
    /// Key fragment storage; bytes past `length` are padding.
    bytes: [u8; 15],
    /// Block index of the next sibling node; 0 means there is none.
    next: u32,
    /// Block index of the first child node; 0 means there is none.
    child: u32,
    /// Whether a value has been stored at this node.
    has_data: bool,
    /// Fixed-size value payload.
    data: [u8; Z],
}
impl<const Z:usize> Node<Z> {
/// Creates a blank node: empty key fragment, no links, no stored value.
fn new() -> Self
{
    let blank_prefix = [0u8; 15];
    let blank_payload = [0u8; Z];
    Self {
        has_data: false,
        length: 0,
        bytes: blank_prefix,
        next: 0,
        child: 0,
        data: blank_payload,
    }
}
fn encode(&self) -> Vec<u8>
{
let mut flags = 0u8;
flags |= (self.has_data as u8) << 7;
flags |= self.length as u8;
[
flags.pack(),
self.bytes.to_vec(),
self.next.pack(),
self.child.pack(),
self.data.to_vec(),
].concat()
}
/// Fills this node from an on-disk record.
///
/// `data` must be exactly one record long (`Self::len()` bytes), otherwise
/// `Err(())` is returned and the node is left untouched. `index` is the
/// read cursor into `data`; it is advanced past every field consumed.
fn decode(&mut self, data:&[u8], index:&mut usize) -> Result<(),()>
{
    if data.len() != Self::len() {
        return Err(());
    }

    // First byte: bit 7 is the "has data" flag, low nibble is the fragment length.
    let header = u8::unpack(data, index)?;
    self.has_data = header & 0x80 != 0;
    self.length = usize::from(header & 0x0F);

    // Fixed 15-byte key fragment.
    let prefix_end = *index + self.bytes.len();
    self.bytes.copy_from_slice(&data[*index..prefix_end]);
    *index = prefix_end;

    // Sibling and child links.
    self.next = u32::unpack(data, index)?;
    self.child = u32::unpack(data, index)?;

    // Fixed-size payload.
    let payload_end = *index + Z;
    self.data.copy_from_slice(&data[*index..payload_end]);
    *index = payload_end;

    Ok(())
}
/// Size in bytes of one encoded record.
const fn len() -> usize
{
    // flags/length byte + prefix bytes + `next` link + `child` link
    let header = 1 + 15 + 4 + 4;
    header + Z
}
}
/// File-backed trie whose nodes are fixed-size blocks carrying `Z` bytes of value each.
pub struct TrieFile<const Z: usize> {
    /// Backing file holding the node blocks.
    file: File,
}
impl<const Z:usize> TrieFile<Z> {
/// Opens the trie file at `path` for reading and writing, creating it if it does not exist.
///
/// # Errors
/// Returns the underlying I/O error if the file cannot be opened or created.
pub fn open<P:AsRef<Path>>(path:P) -> Result<Self, std::io::Error>
{
    let file = File::options()
        .read(true)
        .write(true)
        .create(true)
        .open(path)?;
    Ok(Self { file })
}
/// Stores `data` under `key`, creating or splitting nodes as required.
///
/// Block 0 is the root of the trie; a link value of 0 therefore means "no link".
/// At most `Z` bytes of `data` are stored; shorter values are zero-padded and
/// longer values are truncated. An empty `key` cannot be represented by a node
/// and is ignored (nothing is written).
///
/// # Errors
/// Returns any I/O error raised while reading, writing or allocating blocks.
pub fn set(&mut self, key:&[u8], data:&[u8]) -> Result<(),std::io::Error>
{
    // Overwrites `dst` with the leading bytes of `src`, zero-padding the remainder.
    fn store<const N:usize>(dst:&mut [u8; N], src:&[u8])
    {
        let count = N.min(src.len());
        dst.fill(0);
        dst[..count].copy_from_slice(&src[..count]);
    }

    // Without this guard an empty key on an empty file would allocate a
    // zeroed block that holds neither key bytes nor the value.
    if key.is_empty() {
        return Ok(());
    }

    let mut key_index = 0;

    // Allocate first chain of nodes if none exist.
    if self.block_count()? == 0 {
        self.trailing_nodes(key, &mut key_index, data)?;
        return Ok(());
    }

    let mut node = Node::<Z>::new();
    let mut node_index = 0;

    // Traverse nodes until the whole key has been consumed.
    while key_index < key.len() {
        self.read_node(node_index, &mut node)?;

        // Node does not start with the next key byte: try the next sibling,
        // or append a new sibling chain holding the rest of the key.
        if node.bytes[0] != key[key_index] {
            if node.next != 0 {
                node_index = node.next;
            } else {
                node.next = self.trailing_nodes(key, &mut key_index, data)?;
                self.write_node(node_index, &node)?;
            }
            continue;
        }

        // Count how many bytes of the node's fragment match the remaining key.
        let shared = node.bytes[..node.length]
            .iter()
            .zip(key[key_index..].iter())
            .take_while(|(a, b)| a == b)
            .count();
        key_index += shared;

        if shared == node.length {
            if key_index == key.len() {
                // Exact node for this key: store the value in place.
                node.has_data = true;
                store(&mut node.data, data);
                self.write_node(node_index, &node)?;
            } else if node.child != 0 {
                // Descend into the existing children.
                node_index = node.child;
            } else {
                // No children yet: hang the rest of the key below this node.
                node.child = self.trailing_nodes(key, &mut key_index, data)?;
                self.write_node(node_index, &node)?;
            }
            continue;
        }

        // Only part of the fragment matched: split the node. The current block
        // keeps the shared prefix (so links pointing at it stay valid) and a
        // new child block takes over the remainder, the old children and the
        // old value.
        let old_length = node.length;
        let child_index = self.allocate()?;
        let mut child_node = Node::<Z>::new();
        // The remainder ends at the fragment's real length, not at the end of
        // the 15-byte buffer; including the padding would inflate the child's
        // length and make its key unmatchable.
        child_node.length = old_length - shared;
        child_node.bytes[..old_length - shared].copy_from_slice(&node.bytes[shared..old_length]);
        child_node.child = node.child;
        child_node.has_data = node.has_data;
        child_node.data = node.data;

        node.length = shared;
        node.bytes[shared..].fill(0);
        node.child = child_index;
        node.has_data = false;
        node.data.fill(0);

        if key_index < key.len() {
            // Remaining key bytes become a sibling of the split-off child.
            child_node.next = self.trailing_nodes(key, &mut key_index, data)?;
        } else {
            // The key ends exactly at the split point: the value lives here.
            node.has_data = true;
            store(&mut node.data, data);
        }
        self.write_node(node_index, &node)?;
        self.write_node(child_index, &child_node)?;
    }
    Ok(())
}
/// Looks up the value stored under `key`.
///
/// Returns `Ok(Some(bytes))` with the node's full `Z`-byte payload when a value
/// has been stored for exactly this key, and `Ok(None)` otherwise. That includes
/// the empty key, an empty file, and keys that end on a node holding no value
/// (for example a shared-prefix node created by a split).
///
/// # Errors
/// Returns any I/O error raised while reading blocks.
pub fn get(&self, key:&[u8]) -> Result<Option<Vec<u8>>, std::io::Error>
{
    if self.block_count()? == 0 {
        return Ok(None);
    }

    let mut node = Node::<Z>::new();
    let mut node_index = 0;
    let mut key_index = 0;

    // Traverse nodes until the whole key has been consumed.
    while key_index < key.len() {
        self.read_node(node_index, &mut node)?;

        // Node does not start with the next key byte: move to the next sibling, if any.
        if node.bytes[0] != key[key_index] {
            if node.next == 0 {
                return Ok(None);
            }
            node_index = node.next;
            continue;
        }

        // Count how many bytes of the node's fragment match the remaining key.
        let shared = node.bytes[..node.length]
            .iter()
            .zip(key[key_index..].iter())
            .take_while(|(a, b)| a == b)
            .count();

        // The key diverges from (or ends inside) this fragment.
        if shared != node.length {
            return Ok(None);
        }
        key_index += shared;

        if key_index == key.len() {
            // Only report a value if one was actually stored at this node.
            return Ok(node.has_data.then(|| node.data.to_vec()));
        }

        // More key bytes remain: descend into the children, if any.
        if node.child == 0 {
            return Ok(None);
        }
        node_index = node.child;
    }
    Ok(None)
}
/*pub fn unset(&self, _key:&[u8]) -> Result<(), std::io::Error>
{
Ok(())
}*/
/// Appends a fresh parent-to-child chain of nodes holding `key[*key_index..]`.
///
/// Each node takes up to 15 key bytes; the last node of the chain receives
/// `data` (at most `Z` bytes). `key_index` is advanced to `key.len()`.
/// Returns the block index of the first node in the chain.
fn trailing_nodes(&mut self, key:&[u8], key_index:&mut usize, data:&[u8]) -> Result<u32, std::io::Error>
{
    let first_block = self.allocate()?;
    let mut current_block = first_block;

    while *key_index < key.len() {
        let mut record = Node::<Z>::new();

        // Copy the next run of key bytes into the node.
        let take = (key.len() - *key_index).min(record.bytes.len());
        record.bytes[..take].copy_from_slice(&key[*key_index..*key_index + take]);
        record.length = take;
        *key_index += take;

        if *key_index == key.len() {
            // Key fully consumed: this node carries the value.
            record.has_data = true;
            let payload = Z.min(data.len());
            record.data[..payload].copy_from_slice(&data[..payload]);
        } else {
            // Key bytes remain: reserve the block for the next link in the chain.
            record.child = self.allocate()?;
        }

        self.write_node(current_block, &record)?;
        current_block = record.child;
    }
    Ok(first_block)
}
/// Reads block `index` from the file and decodes it into `node`.
///
/// # Errors
/// Returns the I/O error from seeking or reading (including a short read past
/// the end of the file), or an error if the block cannot be decoded.
fn read_node(&self, index:u32, node:&mut Node<Z>) -> Result<(), std::io::Error>
{
    // `Read` and `Seek` are implemented for `&File`, so a shared reference is
    // enough; there is no need to duplicate the OS handle on every read.
    let mut file = &self.file;
    // Compute the offset in u64 so it cannot overflow a 32-bit `usize`.
    let offset = Self::block_size() as u64 * u64::from(index);
    let mut data = vec![0u8; Self::block_size()];
    file.seek(SeekFrom::Start(offset))?;
    file.read_exact(&mut data)?;
    node.decode(&data, &mut 0).map_err(|_| std::io::Error::other("failed to decode node block"))
}
/// Encodes `node` and writes it over block `index`.
///
/// # Errors
/// Returns the I/O error from seeking or writing.
fn write_node(&mut self, index:u32, node:&Node<Z>) -> Result<(), std::io::Error>
{
    // Compute the offset in u64 so it cannot overflow a 32-bit `usize`.
    let offset = Self::block_size() as u64 * u64::from(index);
    self.file.seek(SeekFrom::Start(offset))?;
    // `write` may stop after a partial write; `write_all` guarantees the whole
    // record reaches the file or an error is returned.
    self.file.write_all(&node.encode())?;
    Ok(())
}
/// Appends one zero-filled block to the file and returns its block index.
///
/// # Errors
/// Returns the I/O error from seeking or writing, or an error if the new
/// block's index would not fit in a `u32`.
fn allocate(&mut self) -> Result<u32, std::io::Error>
{
    let block_size = Self::block_size() as u64;
    let end = self.file.seek(SeekFrom::End(0))?;

    let blocks = end / block_size;
    if blocks > u64::from(u32::MAX) {
        // Links are 32-bit; refuse rather than silently wrap the index.
        return Err(std::io::Error::other("trie file has too many blocks"));
    }
    let block_id = blocks as u32;

    // If the file ends in a partial block, start the new block on the block
    // boundary so the returned index matches the offset used by reads and writes.
    if end % block_size != 0 {
        self.file.seek(SeekFrom::Start(blocks * block_size))?;
    }

    // `write_all` ensures the full block is written, not just a part of it.
    self.file.write_all(&vec![0u8; Self::block_size()])?;
    Ok(block_id)
}
/// Returns the number of complete blocks currently in the file.
///
/// The count saturates at `u32::MAX` so that an oversized file can never be
/// reported as empty through integer wrap-around.
///
/// # Errors
/// Returns the I/O error from querying the file's metadata.
fn block_count(&self) -> Result<u32, std::io::Error>
{
    // The length comes from file metadata, so no handle duplication or
    // cursor movement is needed just to measure the file.
    let bytes = self.file.metadata()?.len();
    let blocks = bytes / Self::block_size() as u64;
    Ok(blocks.min(u64::from(u32::MAX)) as u32)
}
const fn block_size() -> usize
{
Node::<Z>::len()
}
}