re-add source location info

This commit is contained in:
wires 2025-03-12 10:34:23 -04:00
parent 1c5e487e61
commit 52cb1d8ed9
Signed by: wires
SSH key fingerprint: SHA256:9GtP+M3O2IivPDlw1UY872UPUuJH2gI0yG6ExBxaaiM
3 changed files with 116 additions and 37 deletions

View file

@@ -3,62 +3,98 @@
This module contains functions for tokenizing Akhamoth source code. This module contains functions for tokenizing Akhamoth source code.
""". """.
-export([
    new/1,
    source_map/1,
    next/1
]).

-export_type([lexer/0]).

%% Character-class guards shared by the lexer clauses. Guard alternatives
%% use `;` (OR) and `,` (AND) as required inside macro-expanded guards.
-define(is_digit(C), C >= $0, C =< $9).
-define(is_id_start(C), C >= $a, C =< $z; C >= $A, C =< $Z; C =:= $_).
-define(is_space(C), C =:= $\s; C =:= $\t).
-doc """
Tokens for which the category is the same as the content.
""".
-type token_simple() :: '(' | ')' | '[' | ']' | '{' | '}'.

-doc """
Tokens for which there is content beyond the category.
""".
-type token_complex() :: id | number | unknown.

-doc """
A token in the input stream.
""".
%% Simple tokens carry only their start offset (or `inserted` for tokens the
%% lexer synthesizes); complex tokens carry start offset and byte length so
%% the content can be sliced out of the source later.
-type token() ::
    {token_simple(), Position :: non_neg_integer() | inserted}
    | {token_complex(), Position :: non_neg_integer(), Length :: pos_integer()}.

%% Lexer state: the remaining input, the byte offset of its first byte within
%% the original source, and the accumulated line-break map.
-record(lexer, {
    source :: binary(),
    offset = 0 :: non_neg_integer(),
    source_map :: akh_source_map:source_map()
}).

-opaque lexer() :: #lexer{}.

%% Common return type of next/1 and the sub-lexers.
-type return() :: none | {ok, token(), lexer()}.
%%% exports %%% exports
-doc """
Initializes a lexer to tokenize the given binary.
""".
-spec new(binary()) -> lexer().
new(Source) -> #lexer{source = Source, source_map = akh_source_map:new()}.
-doc """
Returns the source map for a lexer.
""".
-spec source_map(lexer()) -> akh_source_map:source_map().
source_map(#lexer{source_map = SourceMap}) -> SourceMap.
-doc """
next(Lexer)

Attempts to get the next token in the input.
""".
-spec next(lexer()) -> none | {ok, token(), lexer()}.
%% Dispatch on the leading byte(s) of the remaining input. Identifier and
%% number starts hand off to the sub-lexers with the first byte already
%% consumed (Len = 1) but `offset` still pointing at the token start; spaces
%% and tabs are skipped; `\n` and `\r\n` record a line break via new_line/1.
%% NOTE(review): a lone `\r` or any other unmatched byte raises
%% function_clause — presumably an `unknown` token is planned; confirm.
next(#lexer{source = <<C, Rest/bytes>>} = Lx) when ?is_id_start(C) ->
    lex_id(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/bytes>>} = Lx) when ?is_digit(C) ->
    lex_number(Lx#lexer{source = Rest}, 1);
next(#lexer{source = <<C, Rest/bytes>>, offset = Offset} = Lx) when ?is_space(C) ->
    next(Lx#lexer{source = Rest, offset = Offset + 1});
next(#lexer{source = <<$\n, Rest/bytes>>, offset = Offset} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Offset + 1});
next(#lexer{source = <<$\r, $\n, Rest/bytes>>, offset = Offset} = Lx) ->
    new_line(Lx#lexer{source = Rest, offset = Offset + 2});
next(#lexer{source = <<>>}) ->
    none.
%%% local functions %%% local functions
%% Consume identifier continuation bytes (letters, digits, `_`) one at a
%% time off the front of the binary; Len counts bytes consumed so far.
%% Emits {id, Offset, Len} where Offset is still the token's start offset
%% (it is only advanced once, when the token is complete).
-spec lex_id(#lexer{}, non_neg_integer()) -> return().
lex_id(#lexer{source = <<C, Rest/bytes>>} = Lx, Len) when ?is_id_start(C); ?is_digit(C) ->
    lex_id(Lx#lexer{source = Rest}, Len + 1);
lex_id(#lexer{offset = Offset} = Lx, Len) ->
    {ok, {id, Offset, Len}, Lx#lexer{offset = Offset + Len}}.
%% Consume digits and `_` separators; emits {number, Offset, Len}, leaving
%% conversion of the textual content to a value for a later phase.
-spec lex_number(#lexer{}, non_neg_integer()) -> return().
lex_number(#lexer{source = <<C, Rest/bytes>>} = Lx, Len) when ?is_digit(C); C =:= $_ ->
    lex_number(Lx#lexer{source = Rest}, Len + 1);
lex_number(#lexer{offset = Offset} = Lx, Len) ->
    {ok, {number, Offset, Len}, Lx#lexer{offset = Offset + Len}}.
%% Record a line break in the source map, then continue tokenizing. The
%% lexer's offset has already been advanced past the newline by next/1,
%% which is the convention insert/2 expects.
-spec new_line(#lexer{}) -> return().
new_line(#lexer{source_map = SourceMap} = Lx) ->
    next(Lx#lexer{source_map = akh_source_map:insert(Lx#lexer.offset, SourceMap)}).

42
src/akh_source_map.erl Normal file
View file

@@ -0,0 +1,42 @@
-module(akh_source_map).
-moduledoc """
The source map translates the byte offsets returned by the lexer into lines and
columns for display. Internally, it uses the balanced binary tree from
`m:gb_trees` to avoid linear lookup times.
""".

-export([
    new/0,
    insert/2,
    location/2
]).

-export_type([source_map/0]).

%% The tree maps the byte offset of each recorded line break to the number of
%% the line that starts immediately after it; the second element caches the
%% current line count so insert/2 needs no lookup.
-opaque source_map() :: {gb_trees:tree(non_neg_integer(), pos_integer()), Line :: pos_integer()}.

%% Line and column are both 1-based.
-type location() :: {Line :: pos_integer(), Column :: pos_integer()}.

-doc """
Returns a new source map.
""".
-spec new() -> source_map().
new() -> {gb_trees:empty(), 1}.

-doc """
Insert the next line break at byte offset `Offset`.
""".
%% `Offset` is the offset just past the newline sequence, so the key stored
%% (Offset - 1) is the offset of the `\n` byte itself (also for `\r\n`).
-spec insert(Offset :: non_neg_integer(), SourceMap :: source_map()) -> source_map().
insert(Offset, {Tree, Line}) -> {gb_trees:insert(Offset - 1, Line + 1, Tree), Line + 1}.

-doc """
Get line and column info for byte offset `Offset`.
""".
-spec location(Offset :: non_neg_integer(), SourceMap :: source_map()) -> location().
location(Offset, {Tree, _}) ->
    case gb_trees:smaller(Offset, Tree) of
        {Start, Line} ->
            %% Start is the newline's offset, so the first byte of the line
            %% (Start + 1) gets column 1.
            {Line, Offset - Start};
        none ->
            %% Offset is on the first line. Columns are 1-based (matching
            %% the branch above); returning the bare offset gave column 0
            %% for the very first byte, violating the pos_integer() spec.
            {1, Offset + 1}
    end.

View file

@@ -7,11 +7,12 @@ compile_file(Path) ->
compile_binary(Source). compile_binary(Source).
%% Compile source from a binary: drain the lexer into a token list, then
%% resolve each token's byte offset to a {Line, Column} pair via the source
%% map collected during lexing.
%% NOTE(review): the fun matches 3-tuple tokens only; a simple 2-tuple token
%% ({token_simple(), Pos}) would crash here — confirm none are emitted yet.
compile_binary(Source) ->
    Lx = akh_lex:new(Source),
    {Tokens, SourceMap} = collect(Lx, []),
    lists:map(fun({T, O, _}) -> {T, akh_source_map:location(O, SourceMap)} end, Tokens).
%% Drain the lexer, accumulating tokens, and return them together with the
%% final source map once the input is exhausted.
%% NOTE(review): tokens are returned in reverse source order (no
%% lists:reverse on the accumulator) — confirm callers expect that.
collect(Lx, Acc) ->
    case akh_lex:next(Lx) of
        none -> {Acc, akh_lex:source_map(Lx)};
        {ok, T, L} -> collect(L, [T | Acc])
    end.