From 52cb1d8ed9ec7ebe06cc3dbf76fe6a2cf032063b Mon Sep 17 00:00:00 2001 From: wires Date: Wed, 12 Mar 2025 10:34:23 -0400 Subject: [PATCH] re-add source location info --- src/akh_lexer.erl | 102 ++++++++++++++++++++++++++++------------- src/akh_source_map.erl | 42 +++++++++++++++++ src/akhamoth.erl | 9 ++-- 3 files changed, 116 insertions(+), 37 deletions(-) create mode 100644 src/akh_source_map.erl diff --git a/src/akh_lexer.erl b/src/akh_lexer.erl index 7385b9f..9128292 100644 --- a/src/akh_lexer.erl +++ b/src/akh_lexer.erl @@ -3,62 +3,98 @@ This module contains functions for tokenizing Akhamoth source code. """. --export([new/1, next/1]). +-export([ + new/1, + source_map/1, + next/1 +]). + +-export_type([lexer/0]). -define(is_digit(C), C >= $0, C =< $9). -define(is_id_start(C), C >= $a, C =< $z; C >= $A, C =< $Z; C =:= $_). +-define(is_space(C), C =:= $\s; C =:= $\t). -doc """ -A token in the input stream +Tokens for which the category is the same as the content. """. --type token() :: any(). +-type token_simple() :: '(' | ')' | '[' | ']' | '{' | '}'. + +-doc """ +Tokens for which there is content beyond the category. +""". +-type token_complex() :: id | number | unknown. + +-doc """ +A token in the input stream. +""". +-type token() :: + {token_simple(), Position :: non_neg_integer() | inserted} + | {token_complex(), Position :: non_neg_integer(), Length :: pos_integer()}. -record(lexer, { source :: binary(), - offset = 0 :: non_neg_integer() + offset = 0 :: non_neg_integer(), + source_map :: akh_source_map:source_map() }). +-opaque lexer() :: #lexer{}. + +-type return() :: none | {ok, token(), lexer()}. + %%% exports -doc """ Initializes a lexer to tokenize the given binary. """. --spec new(binary()) -> #lexer{}. -new(Source) -> #lexer{source = Source}. +-spec new(binary()) -> lexer(). +new(Source) -> #lexer{source = Source, source_map = akh_source_map:new()}. -doc """ +Returns the source map for a lexer. +""". 
+-spec source_map(lexer()) -> akh_source_map:source_map(). +source_map(#lexer{source_map = SourceMap}) -> SourceMap. + +-doc """ +next(Lexer) + Attempts to get the next token in the input. """. --spec next(#lexer{}) -> none | {ok, token(), #lexer{}}. -next(#lexer{source = <<C, _/bytes>>} = Lx) when ?is_id_start(C) -> - lex_id(Lx, 1); -next(#lexer{source = <<C, _/bytes>>} = Lx) when ?is_digit(C) -> - lex_number(Lx, 1, C - $0); +-spec next(lexer()) -> none | {ok, token(), lexer()}. +next(#lexer{source = <<C, Rest/bytes>>} = Lx) when ?is_id_start(C) -> + lex_id(Lx#lexer{source = Rest}, 1); +next(#lexer{source = <<C, Rest/bytes>>} = Lx) when ?is_digit(C) -> + lex_number(Lx#lexer{source = Rest}, 1); +next(#lexer{source = <<C, Rest/bytes>>, offset = Offset} = Lx) when ?is_space(C) -> + next(Lx#lexer{source = Rest, offset = Offset + 1}); +next(#lexer{source = <<$\n, Rest/bytes>>, offset = Offset} = Lx) -> + new_line(Lx#lexer{source = Rest, offset = Offset + 1}); +next(#lexer{source = <<$\r, $\n, Rest/bytes>>, offset = Offset} = Lx) -> + new_line(Lx#lexer{source = Rest, offset = Offset + 2}); next(#lexer{source = <<>>}) -> none. %%% local functions -lex_id(#lexer{source = Source, offset = Offset} = Lx, Len) when Len < byte_size(Source) -> - C = binary:at(Source, Len), - if - ?is_id_start(C); ?is_digit(C) -> - lex_id(Lx, Len + 1); - true -> - {Id, Rest} = split_binary(Source, Len), - {ok, {id, Offset, Id}, Lx#lexer{source = Rest, offset = Offset + Len}} - end; -lex_id(#lexer{source = Source, offset = Offset} = Lx, Len) -> - {ok, {id, Offset, Source}, Lx#lexer{source = <<>>, offset = Offset + Len}}. +-spec lex_id(#lexer{}, non_neg_integer()) -> return(). +lex_id( + #lexer{source = <<C, Rest/bytes>>} = Lx, + Len +) when ?is_id_start(C); ?is_digit(C) -> + lex_id(Lx#lexer{source = Rest}, Len + 1); +lex_id(#lexer{offset = Offset} = Lx, Len) -> + {ok, {id, Offset, Len}, Lx#lexer{offset = Offset + Len}}. 
-lex_number(#lexer{source = Source, offset = Offset} = Lx, Len, Acc) when Len < byte_size(Source) -> - C = binary:at(Source, Len), - if - ?is_digit(C) -> - lex_number(Lx, Len + 1, Acc * 10 + C - $0); - true -> - {_, Rest} = split_binary(Source, Len), - {ok, {number, Offset, Acc}, Lx#lexer{source = Rest, offset = Offset + Len}} - end; -lex_number(#lexer{offset = Offset} = Lx, Len, Acc) -> - {ok, {number, Offset, Acc}, Lx#lexer{source = <<>>, offset = Offset + Len}}. +-spec lex_number(#lexer{}, non_neg_integer()) -> return(). +lex_number( + #lexer{source = <<C, Rest/bytes>>} = Lx, + Len +) when ?is_digit(C); C =:= $_ -> + lex_number(Lx#lexer{source = Rest}, Len + 1); +lex_number(#lexer{offset = Offset} = Lx, Len) -> + {ok, {number, Offset, Len}, Lx#lexer{offset = Offset + Len}}. + +-spec new_line(#lexer{}) -> return(). +new_line(#lexer{source_map = SourceMap} = Lx) -> + next(Lx#lexer{source_map = akh_source_map:insert(Lx#lexer.offset, SourceMap)}). diff --git a/src/akh_source_map.erl b/src/akh_source_map.erl new file mode 100644 index 0000000..661bab3 --- /dev/null +++ b/src/akh_source_map.erl @@ -0,0 +1,42 @@ +-module(akh_source_map). +-moduledoc """ +The source map translates the byte offsets returned by the lexer into lines and +columns for display. Internally, it uses the balanced binary tree from +`m:gb_trees` to avoid linear lookup times. +""". + +-export([ + new/0, + insert/2, + location/2 +]). + +-export_type([source_map/0]). + +-opaque source_map() :: {gb_trees:tree(non_neg_integer(), pos_integer()), Line :: pos_integer()}. + +-type location() :: {Line :: pos_integer(), Column :: pos_integer()}. + +-doc """ +Returns a new source map. +""". +-spec new() -> source_map(). +new() -> {gb_trees:empty(), 1}. + +-doc """ +Insert the next line break at byte offset `Offset`. +""". +-spec insert(Offset :: non_neg_integer(), SourceMap :: source_map()) -> source_map(). +insert(Offset, {Tree, Line}) -> {gb_trees:insert(Offset - 1, Line + 1, Tree), Line + 1}. 
+ +-doc """ +Get line and column info for byte offset `Offset`. +""". +-spec location(Offset :: non_neg_integer(), SourceMap :: source_map()) -> location(). +location(Offset, {Tree, _}) -> + case gb_trees:smaller(Offset, Tree) of + {Start, Line} -> + {Line, Offset - Start}; + none -> + {1, Offset} + end. diff --git a/src/akhamoth.erl b/src/akhamoth.erl index 53e946e..57caf83 100644 --- a/src/akhamoth.erl +++ b/src/akhamoth.erl @@ -7,11 +7,12 @@ compile_file(Path) -> compile_binary(Source). compile_binary(Source) -> - Lx = akh_lexer:new(Source), - collect(Lx, []). + Lx = akh_lex:new(Source), + {Tokens, SourceMap} = collect(Lx, []), + lists:map(fun({T, O, _}) -> {T, akh_source_map:location(O, SourceMap)} end, Tokens). collect(Lx, Acc) -> - case akh_lexer:next(Lx) of - none -> Acc; + case akh_lex:next(Lx) of + none -> {Acc, akh_lex:source_map(Lx)}; {ok, T, L} -> collect(L, [T | Acc]) end.