diff options
Diffstat (limited to 'lexer')
| -rw-r--r-- | lexer/build.gradle.kts | 19 | ||||
| -rw-r--r-- | lexer/src/main/java/lv/enes/orang/lexer/Lexer.java | 197 | ||||
| -rw-r--r-- | lexer/src/main/java/lv/enes/orang/lexer/Token.java | 62 | ||||
| -rw-r--r-- | lexer/src/main/java/module-info.java | 5 |
4 files changed, 283 insertions, 0 deletions
diff --git a/lexer/build.gradle.kts b/lexer/build.gradle.kts new file mode 100644 index 0000000..7fe8777 --- /dev/null +++ b/lexer/build.gradle.kts | |||
| @@ -0,0 +1,19 @@ | |||
// Build script for the lexer module: a plain Java project that depends on the
// sibling ":utils" project and is compiled for Java 22 with preview features on.

// Single place to state the Java release used for source, target and toolchain.
val javaRelease = 22

plugins {
    `java`
}

dependencies {
    implementation(project(":utils"))
}

java {
    sourceCompatibility = JavaVersion.toVersion(javaRelease)
    targetCompatibility = JavaVersion.toVersion(javaRelease)
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(javaRelease))
    }
}

// Every Java compilation task gets the flag that allows preview language features.
tasks.withType<JavaCompile> {
    options.compilerArgs.add("--enable-preview")
}
diff --git a/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java b/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java new file mode 100644 index 0000000..8fec98e --- /dev/null +++ b/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java | |||
| @@ -0,0 +1,197 @@ | |||
| 1 | package lv.enes.orang.lexer; | ||
| 2 | |||
| 3 | import lv.enes.orang.utils.Codepoint; | ||
| 4 | import lv.enes.orang.utils.PeekableStream; | ||
| 5 | |||
| 6 | import java.io.*; | ||
| 7 | import java.util.Iterator; | ||
| 8 | import java.util.function.BiFunction; | ||
| 9 | import java.util.function.Predicate; | ||
| 10 | import java.util.stream.IntStream; | ||
| 11 | import java.util.stream.Stream; | ||
| 12 | |||
| 13 | public class Lexer implements Iterator<Token> { | ||
| 14 | public static boolean isIdentInitial(Codepoint cp) { | ||
| 15 | return Character.isLetter(cp.cp()) || cp.cp() == '_'; | ||
| 16 | } | ||
| 17 | |||
| 18 | public static boolean isIdentFinal(Codepoint cp) { | ||
| 19 | return isIdentInitial(cp) || Character.isDigit(cp.cp()); | ||
| 20 | } | ||
| 21 | |||
| 22 | public static boolean isNewline(Codepoint cp) { | ||
| 23 | return cp.cp() == '\n'; | ||
| 24 | } | ||
| 25 | |||
| 26 | public static boolean isNumeral(Codepoint cp) { | ||
| 27 | return Character.isDigit(cp.cp()); | ||
| 28 | } | ||
| 29 | |||
| 30 | public static boolean isWhitespace(Codepoint cp) { | ||
| 31 | return Character.isWhitespace(cp.cp()); | ||
| 32 | } | ||
| 33 | |||
| 34 | private final PeekableStream<Codepoint> input; | ||
| 35 | |||
| 36 | public Lexer(InputStream input) { | ||
| 37 | this(new InputStreamReader(input)); | ||
| 38 | } | ||
| 39 | |||
| 40 | public Lexer(Reader input) { | ||
| 41 | var cpStream = new BufferedReader(input) | ||
| 42 | .lines() | ||
| 43 | .flatMapToInt(str -> IntStream.concat(str.codePoints(), IntStream.of('\n'))) | ||
| 44 | .mapToObj(Codepoint::new); | ||
| 45 | var theEof = Stream.of(new Codepoint(-1)); | ||
| 46 | this.input = new PeekableStream<>(Stream.concat(cpStream, theEof).iterator()); | ||
| 47 | } | ||
| 48 | |||
| 49 | public Lexer(String input) { | ||
| 50 | this(new StringReader(input)); | ||
| 51 | } | ||
| 52 | |||
| 53 | private boolean hasNext = true; | ||
| 54 | |||
| 55 | @Override | ||
| 56 | public Token next() { | ||
| 57 | var tok = nextToken(); | ||
| 58 | if (tok.type() == Token.Type.EOF) { | ||
| 59 | hasNext = false; | ||
| 60 | } | ||
| 61 | return tok; | ||
| 62 | } | ||
| 63 | |||
| 64 | @Override | ||
| 65 | public boolean hasNext() { | ||
| 66 | return hasNext; | ||
| 67 | } | ||
| 68 | |||
| 69 | private Token nextToken() { | ||
| 70 | skipWhitespace(); | ||
| 71 | return switch (input.peek().cp()) { | ||
| 72 | case -1 -> new Token(Token.Type.EOF, ""); | ||
| 73 | |||
| 74 | case '*' -> new Token(Token.Type.ASTERISK, input.next()); | ||
| 75 | case '!' -> new Token(Token.Type.BANG, input.next()); | ||
| 76 | case '[' -> new Token(Token.Type.BRACKET_LEFT, input.next()); | ||
| 77 | case ']' -> new Token(Token.Type.BRACKET_RIGHT, input.next()); | ||
| 78 | case ',' -> new Token(Token.Type.COMMA, input.next()); | ||
| 79 | case '=' -> new Token(Token.Type.EQUAL, input.next()); | ||
| 80 | case '>' -> { | ||
| 81 | var first = input.next(); | ||
| 82 | if (input.peek().cp() == '=') { | ||
| 83 | yield new Token(Token.Type.GREATER_EQUAL, first, input.next()); | ||
| 84 | } else { | ||
| 85 | yield new Token(Token.Type.GREATER, first); | ||
| 86 | } | ||
| 87 | } | ||
| 88 | case '<' -> { | ||
| 89 | var first = input.next(); | ||
| 90 | if (input.peek().cp() == '=') { | ||
| 91 | yield new Token(Token.Type.LESS_EQUAL, first, input.next()); | ||
| 92 | } else { | ||
| 93 | yield new Token(Token.Type.LESS, first); | ||
| 94 | } | ||
| 95 | } | ||
| 96 | case '-' -> { | ||
| 97 | var first = input.next(); | ||
| 98 | if (input.peek().cp() == '>') { | ||
| 99 | yield new Token(Token.Type.MINUS_GREATER, first, input.next()); | ||
| 100 | } else { | ||
| 101 | yield new Token(Token.Type.MINUS, first); | ||
| 102 | } | ||
| 103 | } | ||
| 104 | case '(' -> new Token(Token.Type.PAREN_LEFT, input.next()); | ||
| 105 | case ')' -> new Token(Token.Type.PAREN_RIGHT, input.next()); | ||
| 106 | case '+' -> new Token(Token.Type.PLUS, input.next()); | ||
| 107 | case '?' -> { | ||
| 108 | var first = input.next(); | ||
| 109 | if (input.peek().cp() == '=') { | ||
| 110 | yield new Token(Token.Type.QUESTION_EQUAL, first, input.next()); | ||
| 111 | } else { | ||
| 112 | yield new Token(Token.Type.ILLEGAL, first, input.next()); | ||
| 113 | } | ||
| 114 | } | ||
| 115 | case ';' -> new Token(Token.Type.SEMICOLON, input.next()); | ||
| 116 | case '/' -> { | ||
| 117 | var first = input.next(); | ||
| 118 | if (input.peek().cp() == '=') { | ||
| 119 | yield new Token(Token.Type.SLASH_EQUAL, first, input.next()); | ||
| 120 | } else { | ||
| 121 | yield new Token(Token.Type.SLASH, first); | ||
| 122 | } | ||
| 123 | } | ||
| 124 | |||
| 125 | case '"' -> new Token(Token.Type.STRING, readString()); | ||
| 126 | |||
| 127 | default -> { | ||
| 128 | if (isIdentInitial(input.peek())) { | ||
| 129 | var ident = readIdentifier(); | ||
| 130 | var type = switch (ident) { | ||
| 131 | case "and" -> Token.Type.AND; | ||
| 132 | case "def" -> Token.Type.DEF; | ||
| 133 | case "do" -> Token.Type.DO; | ||
| 134 | case "else" -> Token.Type.ELSE; | ||
| 135 | case "end" -> Token.Type.END; | ||
| 136 | case "false" -> Token.Type.FALSE; | ||
| 137 | case "fn" -> Token.Type.FN; | ||
| 138 | case "if" -> Token.Type.IF; | ||
| 139 | case "in" -> Token.Type.IN; | ||
| 140 | case "let" -> Token.Type.LET; | ||
| 141 | case "then" -> Token.Type.THEN; | ||
| 142 | case "true" -> Token.Type.TRUE; | ||
| 143 | default -> Token.Type.IDENTIFIER; | ||
| 144 | }; | ||
| 145 | yield new Token(type, ident); | ||
| 146 | } else if (isNumeral(input.peek())) { | ||
| 147 | yield new Token(Token.Type.INTEGER, readInteger()); | ||
| 148 | } else { | ||
| 149 | yield new Token(Token.Type.ILLEGAL, input.next()); | ||
| 150 | } | ||
| 151 | } | ||
| 152 | }; | ||
| 153 | } | ||
| 154 | |||
| 155 | private <T> T foldWhile(Predicate<Codepoint> pred, T initial, BiFunction<T, Codepoint, T> combine) { | ||
| 156 | var res = initial; | ||
| 157 | var ch = input.peek(); | ||
| 158 | while (pred.test(ch)) { | ||
| 159 | res = combine.apply(res, input.next()); | ||
| 160 | ch = input.peek(); | ||
| 161 | } | ||
| 162 | return res; | ||
| 163 | } | ||
| 164 | |||
| 165 | private String readWhile(Predicate<Codepoint> pred) { | ||
| 166 | return foldWhile(pred, new StringBuilder(), StringBuilder::append).toString(); | ||
| 167 | } | ||
| 168 | |||
| 169 | private void skipWhile(Predicate<Codepoint> pred) { | ||
| 170 | foldWhile(pred, Object.class, (x, _) -> x); | ||
| 171 | } | ||
| 172 | |||
| 173 | private String readIdentifier() { | ||
| 174 | return readWhile(Lexer::isIdentFinal); | ||
| 175 | } | ||
| 176 | |||
| 177 | private String readInteger() { | ||
| 178 | return readWhile(Lexer::isNumeral); | ||
| 179 | } | ||
| 180 | |||
| 181 | private String readString() { | ||
| 182 | input.next(); | ||
| 183 | var literal = readWhile(cp -> cp.cp() != '"'); | ||
| 184 | input.next(); | ||
| 185 | return literal; | ||
| 186 | } | ||
| 187 | |||
| 188 | private void skipWhitespace() { | ||
| 189 | while (true) { | ||
| 190 | skipWhile(Lexer::isWhitespace); | ||
| 191 | if (input.peek().cp() != '#') { | ||
| 192 | return; | ||
| 193 | } | ||
| 194 | skipWhile(cp -> !isNewline(cp)); | ||
| 195 | } | ||
| 196 | } | ||
| 197 | } | ||
diff --git a/lexer/src/main/java/lv/enes/orang/lexer/Token.java b/lexer/src/main/java/lv/enes/orang/lexer/Token.java new file mode 100644 index 0000000..59626c7 --- /dev/null +++ b/lexer/src/main/java/lv/enes/orang/lexer/Token.java | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | package lv.enes.orang.lexer; | ||
| 2 | |||
| 3 | import lv.enes.orang.utils.Codepoint; | ||
| 4 | |||
| 5 | public record Token(Type type, String literal) { | ||
| 6 | public Token(Type type, Codepoint... cps) { | ||
| 7 | this(type, codepointsToString(cps)); | ||
| 8 | } | ||
| 9 | |||
| 10 | private static String codepointsToString(Codepoint... cps) { | ||
| 11 | var sb = new StringBuilder(cps.length); | ||
| 12 | for (var cp : cps) { | ||
| 13 | sb.append(cp); | ||
| 14 | } | ||
| 15 | return sb.toString(); | ||
| 16 | } | ||
| 17 | |||
| 18 | public enum Type { | ||
| 19 | ILLEGAL, | ||
| 20 | EOF, | ||
| 21 | |||
| 22 | // Literals | ||
| 23 | IDENTIFIER, | ||
| 24 | INTEGER, | ||
| 25 | STRING, | ||
| 26 | |||
| 27 | // Keywords | ||
| 28 | AND, | ||
| 29 | DEF, | ||
| 30 | DO, | ||
| 31 | ELSE, | ||
| 32 | END, | ||
| 33 | FALSE, | ||
| 34 | FN, | ||
| 35 | IF, | ||
| 36 | IN, | ||
| 37 | LET, | ||
| 38 | THEN, | ||
| 39 | TRUE, | ||
| 40 | |||
| 41 | // Special chars | ||
| 42 | ASTERISK, | ||
| 43 | BANG, | ||
| 44 | BRACKET_LEFT, | ||
| 45 | BRACKET_RIGHT, | ||
| 46 | COMMA, | ||
| 47 | EQUAL, | ||
| 48 | GREATER, | ||
| 49 | GREATER_EQUAL, | ||
| 50 | LESS, | ||
| 51 | LESS_EQUAL, | ||
| 52 | MINUS, | ||
| 53 | MINUS_GREATER, | ||
| 54 | PAREN_LEFT, | ||
| 55 | PAREN_RIGHT, | ||
| 56 | PLUS, | ||
| 57 | QUESTION_EQUAL, | ||
| 58 | SEMICOLON, | ||
| 59 | SLASH, | ||
| 60 | SLASH_EQUAL, | ||
| 61 | } | ||
| 62 | } | ||
diff --git a/lexer/src/main/java/module-info.java b/lexer/src/main/java/module-info.java new file mode 100644 index 0000000..a57a694 --- /dev/null +++ b/lexer/src/main/java/module-info.java | |||
| @@ -0,0 +1,5 @@ | |||
| 1 | module lv.enes.orang.lexer { | ||
| 2 | exports lv.enes.orang.lexer; | ||
| 3 | |||
| 4 | requires lv.enes.orang.utils; | ||
| 5 | } \ No newline at end of file | ||