From b01b560d316d69914691b710f958c4098a1f3ff1 Mon Sep 17 00:00:00 2001
From: wires
Date: Fri, 11 Jul 2025 11:21:14 -0400
Subject: [PATCH] parser round 1

---
 Cargo.lock            |  37 ++++++++
 Cargo.toml            |   4 +
 src/main.rs           |  58 +++++++++---
 src/parsing.rs        | 205 ++++++++++++++++++++++++++++++++++++++++++
 src/parsing/cursor.rs |  69 ++++++++++++++
 src/parsing/symbol.rs |  77 ++++++++++++++++
 6 files changed, 436 insertions(+), 14 deletions(-)
 create mode 100644 src/parsing.rs
 create mode 100644 src/parsing/cursor.rs
 create mode 100644 src/parsing/symbol.rs

diff --git a/Cargo.lock b/Cargo.lock
index 4ef4ed1..8bf0af1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -41,6 +41,12 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
 
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
 [[package]]
 name = "errno"
 version = "0.3.13"
@@ -68,6 +74,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "hashbrown"
+version = "0.15.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5"
+
 [[package]]
 name = "home"
 version = "0.5.11"
@@ -77,6 +89,16 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "indexmap"
+version = "2.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661"
+dependencies = [
+ "equivalent",
+ "hashbrown",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.174"
@@ -185,6 +207,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
 [[package]]
 name = "smallvec"
 version = "1.15.1"
@@ -222,6 +250,12 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "typed-arena"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.18"
@@ -397,6 +431,9 @@ name = "wires_lisp"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "indexmap",
  "rustyline",
+ "scoped-tls",
  "thiserror",
+ "typed-arena",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 3c2a510..87641e3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,4 +6,8 @@ license = "MIT"
 
 [dependencies]
 anyhow = "1.0.98"
+indexmap = "2.10.0"
 rustyline = "16.0.0"
+scoped-tls = "1.0.1"
+thiserror = "2.0.12"
+typed-arena = "2.0.2"
diff --git a/src/main.rs b/src/main.rs
index 944b4bd..ef2bf69 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,19 +1,49 @@
-use rustyline::error::ReadlineError;
+use std::cell::RefCell;
 
-fn main() -> anyhow::Result<()> {
-    let mut rl = rustyline::DefaultEditor::new()?;
-    loop {
-        match rl.readline("> ") {
-            Ok(line) => {
-                eval(&line);
-                rl.add_history_entry(line)?;
-            }
-            Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => break Ok(()),
-            Err(e) => break Err(e.into()),
-        }
+use rustyline::error::ReadlineError;
+use scoped_tls::scoped_thread_local;
+
+use crate::parsing::{Interner, parse};
+
+mod parsing;
+
+#[derive(Default)]
+struct Session {
+    interner: RefCell<Interner>,
+}
+
+scoped_thread_local!(static SESSION: Session);
+
+fn with_session<R>(f: impl FnOnce(&Session) -> R) -> R {
+    SESSION.with(f)
+}
+
+fn create_session_then<R>(f: impl FnOnce() -> R) -> R {
+    assert!(!SESSION.is_set());
+    let session = Default::default();
+    SESSION.set(&session, f)
+}
+
+fn eval(line: &str) {
+    for expr in parse(line) {
+        println!("{expr:?}");
     }
 }
 
-fn eval(input: &str) {
-    println!("{input}");
+fn main() -> anyhow::Result<()> {
+    create_session_then(|| {
+        let mut rl = rustyline::DefaultEditor::new()?;
+
+        loop {
+            match rl.readline("> ") {
+                Ok(line) => {
+                    rl.add_history_entry(line.clone())?;
+                    eval(&line);
+                }
+                Err(ReadlineError::Interrupted) => (),
+                Err(ReadlineError::Eof) => break Ok(()),
+                Err(e) => break Err(e.into()),
+            }
+        }
+    })
 }
diff --git a/src/parsing.rs b/src/parsing.rs
new file mode 100644
index 0000000..0ab16f9
--- /dev/null
+++ b/src/parsing.rs
@@ -0,0 +1,205 @@
+mod cursor;
+mod symbol;
+
+use std::num::ParseIntError;
+
+pub use cursor::Cursor;
+pub use symbol::Interner;
+
+use symbol::Symbol;
+
+/// Returns `true` if `c` may begin an atom.
+///
+/// `-` and `+` are absent on purpose: the lexer handles them separately
+/// because they may also begin a signed number.
+fn is_atom_start(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '*' | '/' | '=' | '<' | '>')
+}
+
+/// Returns `true` if `c` may appear after the first character of an atom.
+fn is_atom_continue(c: char) -> bool {
+    matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '*' | '/' | '=' | '<' | '>' | '-' | '+')
+}
+
+#[derive(Debug)]
+enum TokenKind {
+    OpenParen,
+    CloseParen,
+    Atom,
+    Number,
+    /// A character the lexer does not recognise.
+    Unknown,
+}
+
+/// Byte range of a token within the source string.
+#[derive(Debug)]
+pub struct Span {
+    start: usize,
+    len: usize,
+}
+
+#[derive(Debug)]
+struct Token {
+    kind: TokenKind,
+    span: Span,
+}
+
+impl<'a> Cursor<'a> {
+    /// Scans the next token, skipping leading whitespace.
+    ///
+    /// Returns `None` once the input is exhausted.
+    fn next_token(&mut self) -> Option<Token> {
+        self.reset_span();
+
+        let kind = match self.bump()? {
+            c if c.is_whitespace() => {
+                self.eat_while(|c| c.is_whitespace());
+                return self.next_token();
+            }
+            c if is_atom_start(c) => {
+                self.eat_while(is_atom_continue);
+                TokenKind::Atom
+            }
+            '0'..='9' => self.number_or_atom(),
+            '-' | '+' => {
+                if self.peek().is_ascii_digit() {
+                    self.number_or_atom()
+                } else {
+                    self.eat_while(is_atom_continue);
+                    TokenKind::Atom
+                }
+            }
+            '(' => TokenKind::OpenParen,
+            ')' => TokenKind::CloseParen,
+            // Input comes straight from the REPL, so an unsupported character
+            // must surface as a parse error rather than a panic.
+            _ => TokenKind::Unknown,
+        };
+
+        Some(Token {
+            kind,
+            span: self.cur_span(),
+        })
+    }
+
+    /// Finishes a token that began with a digit, or with a sign followed by
+    /// a digit: a number if only digits follow, otherwise an atom.
+    fn number_or_atom(&mut self) -> TokenKind {
+        self.eat_while(|c| c.is_ascii_digit());
+        if is_atom_continue(self.peek()) {
+            self.eat_while(is_atom_continue);
+            TokenKind::Atom
+        } else {
+            TokenKind::Number
+        }
+    }
+}
+
+/// A parsed s-expression.
+#[derive(Debug)]
+pub enum Expr {
+    Atom(Symbol),
+    Number(i32),
+    List(Vec<Expr>),
+}
+
+impl Expr {
+    fn atom(string: &str) -> Self {
+        Self::Atom(Symbol::new(string))
+    }
+
+    fn parse_int(string: &str) -> Result<Self, ParseIntError> {
+        string.parse::<i32>().map(Self::Number)
+    }
+}
+
+/// Errors produced while parsing.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    /// The input ended inside an unclosed list.
+    #[error("incomplete form")]
+    Incomplete,
+    /// A token appeared where it is not allowed, or is not recognised.
+    #[error("Unexpected token")]
+    Unexpected,
+    /// A number literal could not be parsed as an `i32` (for example, it overflows).
+    #[error(transparent)]
+    ParseInt(#[from] ParseIntError),
+}
+
+/// Iterator over the top-level expressions of a source string.
+pub struct ParseIter<'a> {
+    src: &'a str,
+    cursor: Cursor<'a>,
+}
+
+impl<'a> ParseIter<'a> {
+    fn new(src: &'a str) -> Self {
+        Self {
+            src,
+            cursor: Cursor::new(src),
+        }
+    }
+
+    /// Parses list elements up to the matching `)`; the opening `(` has
+    /// already been consumed.
+    fn parse_list(&mut self) -> Result<Expr, Error> {
+        let mut res = vec![];
+
+        while let Some(term) = self.parse_list_helper() {
+            res.push(term?);
+        }
+
+        Ok(Expr::List(res))
+    }
+
+    /// Parses one list element, returning `None` at the closing `)`.
+    fn parse_list_helper(&mut self) -> Option<Result<Expr, Error>> {
+        let Self { src, cursor } = self;
+        let Token {
+            kind,
+            span: Span { start, len },
+        } = match cursor.next_token() {
+            None => return Some(Err(Error::Incomplete)),
+            Some(t) => t,
+        };
+        let end = start + len;
+
+        let src_str = &src[start..end];
+
+        match kind {
+            TokenKind::Atom => Some(Ok(Expr::atom(src_str))),
+            TokenKind::Number => Some(Expr::parse_int(src_str).map_err(Into::into)),
+            TokenKind::OpenParen => Some(self.parse_list()),
+            TokenKind::CloseParen => None,
+            TokenKind::Unknown => Some(Err(Error::Unexpected)),
+        }
+    }
+}
+
+impl<'a> Iterator for ParseIter<'a> {
+    type Item = Result<Expr, Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let Self { src, cursor } = self;
+        let Token {
+            kind,
+            span: Span { start, len },
+        } = cursor.next_token()?;
+        let end = start + len;
+
+        let src_str = &src[start..end];
+
+        Some(match kind {
+            TokenKind::Atom => Ok(Expr::atom(src_str)),
+            TokenKind::Number => Expr::parse_int(src_str).map_err(Into::into),
+            TokenKind::OpenParen => self.parse_list(),
+            TokenKind::CloseParen | TokenKind::Unknown => Err(Error::Unexpected),
+        })
+    }
+}
+
+/// Parses `input` lazily, yielding one result per top-level expression.
+pub fn parse<'a>(input: &'a str) -> ParseIter<'a> {
+    ParseIter::new(input)
+}
diff --git a/src/parsing/cursor.rs b/src/parsing/cursor.rs
new file mode 100644
index 0000000..3b6e2bb
--- /dev/null
+++ b/src/parsing/cursor.rs
@@ -0,0 +1,69 @@
+use std::str::Chars;
+
+use super::Span;
+
+/// Character cursor over the source text that tracks the byte span of the
+/// token currently being scanned.
+pub struct Cursor<'a> {
+    chars: Chars<'a>,
+    /// Byte offset of the current span's first character.
+    start: usize,
+    /// Byte length of the current span.
+    len: usize,
+}
+
+impl<'a> Cursor<'a> {
+    /// Creates a cursor positioned at the start of `input`.
+    pub fn new(input: &'a str) -> Self {
+        Self {
+            chars: input.chars(),
+            start: 0,
+            len: 0,
+        }
+    }
+
+    /// Consumes the next character and extends the current span over it.
+    ///
+    /// The span grows by the character's UTF-8 length rather than by one,
+    /// because spans are used to slice the source string by byte offset.
+    /// At end of input nothing is consumed and the span is left untouched.
+    pub fn bump(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+        self.len += c.len_utf8();
+        Some(c)
+    }
+
+    /// Returns the next character without consuming it, or `'\0'` at end
+    /// of input.
+    pub fn peek(&self) -> char {
+        self.chars.clone().next().unwrap_or('\0')
+    }
+
+    /// Returns `true` once every character has been consumed.
+    pub fn is_empty(&self) -> bool {
+        self.chars.as_str().is_empty()
+    }
+
+    /// Consumes characters for as long as `pred` accepts them.
+    pub fn eat_while(&mut self, pred: impl Fn(char) -> bool) {
+        // `peek` yields `'\0'` at end of input; the emptiness check stops
+        // the loop even if `pred` accepts that sentinel.
+        while pred(self.peek()) && !self.is_empty() {
+            self.bump();
+        }
+    }
+
+    /// Returns the span covered since the last `reset_span`.
+    pub fn cur_span(&self) -> Span {
+        Span {
+            start: self.start,
+            len: self.len,
+        }
+    }
+
+    /// Starts a new, empty span immediately after the current one.
+    pub fn reset_span(&mut self) {
+        self.start += self.len;
+        self.len = 0;
+    }
+}
diff --git a/src/parsing/symbol.rs b/src/parsing/symbol.rs
new file mode 100644
index 0000000..8ba9ff6
--- /dev/null
+++ b/src/parsing/symbol.rs
@@ -0,0 +1,77 @@
+use std::fmt::{self, Formatter};
+
+use indexmap::IndexSet;
+use typed_arena::Arena;
+
+use crate::with_session;
+
+/// String interner: stores each distinct string once and hands out a small
+/// integer [`Symbol`] for it.
+#[derive(Default)]
+pub struct Interner {
+    arena: Arena<u8>,
+    strings: IndexSet<&'static str>,
+}
+
+impl Interner {
+    /// Returns the symbol for `string`, storing the string on first use.
+    fn intern(&mut self, string: &str) -> Symbol {
+        if let Some(idx) = self.strings.get_index_of(string) {
+            return Symbol(idx as u32);
+        }
+
+        let string = self.arena.alloc_str(string);
+
+        // SAFETY: the arena keeps every allocation in place until the arena
+        // itself is dropped, and `strings` lives in the same struct, so the
+        // reference stays valid for as long as it is reachable. `'static` is
+        // a private fiction: `get` only hands the string out with a lifetime
+        // tied to `&self`.
+        let string: &'static str = unsafe { &*(string as *const str) };
+
+        let (i, new) = self.strings.insert_full(string);
+        debug_assert!(new);
+
+        Symbol(i as u32)
+    }
+
+    /// Looks up the string behind `symbol`, if it was interned here.
+    fn get(&self, symbol: Symbol) -> Option<&str> {
+        self.strings.get_index(symbol.0 as usize).copied()
+    }
+}
+
+/// Handle to a string interned in the current session's [`Interner`].
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub struct Symbol(u32);
+
+impl fmt::Debug for Symbol {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(self.as_str(), f)
+    }
+}
+
+impl fmt::Display for Symbol {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(self.as_str(), f)
+    }
+}
+
+impl Symbol {
+    /// Interns `string` in the current session.
+    ///
+    /// Panics if called outside `create_session_then`.
+    pub fn new(string: &str) -> Self {
+        with_session(|session| session.interner.borrow_mut().intern(string))
+    }
+
+    fn as_str(&self) -> &str {
+        // SAFETY: only the lifetime changes. The string is owned by the
+        // session's interner, which never frees interned strings, so it
+        // outlives the `RefCell` borrow taken here. The result must not be
+        // kept past the end of the session.
+        with_session(|session| unsafe {
+            std::mem::transmute::<&str, &str>(session.interner.borrow().get(*self).unwrap())
+        })
+    }
+}