summaryrefslogtreecommitdiff
path: root/lexer
diff options
context:
space:
mode:
Diffstat (limited to 'lexer')
-rw-r--r--lexer/build.gradle.kts19
-rw-r--r--lexer/src/main/java/lv/enes/orang/lexer/Lexer.java197
-rw-r--r--lexer/src/main/java/lv/enes/orang/lexer/Token.java62
-rw-r--r--lexer/src/main/java/module-info.java5
4 files changed, 283 insertions, 0 deletions
diff --git a/lexer/build.gradle.kts b/lexer/build.gradle.kts
new file mode 100644
index 0000000..7fe8777
--- /dev/null
+++ b/lexer/build.gradle.kts
@@ -0,0 +1,19 @@
// Gradle build script for the lexer module.
plugins {
    java
}

java {
    // Source level, bytecode target and toolchain are all pinned to Java 22.
    sourceCompatibility = JavaVersion.VERSION_22
    targetCompatibility = JavaVersion.VERSION_22
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(22))
    }
}

dependencies {
    // The lexer builds on the sibling ":utils" project.
    implementation(project(":utils"))
}

tasks.withType<JavaCompile> {
    // Every Java compilation in this module runs with preview features enabled.
    options.compilerArgs.add("--enable-preview")
}
diff --git a/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java b/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java
new file mode 100644
index 0000000..8fec98e
--- /dev/null
+++ b/lexer/src/main/java/lv/enes/orang/lexer/Lexer.java
@@ -0,0 +1,197 @@
1package lv.enes.orang.lexer;
2
3import lv.enes.orang.utils.Codepoint;
4import lv.enes.orang.utils.PeekableStream;
5
6import java.io.*;
7import java.util.Iterator;
8import java.util.function.BiFunction;
9import java.util.function.Predicate;
10import java.util.stream.IntStream;
11import java.util.stream.Stream;
12
13public class Lexer implements Iterator<Token> {
14 public static boolean isIdentInitial(Codepoint cp) {
15 return Character.isLetter(cp.cp()) || cp.cp() == '_';
16 }
17
18 public static boolean isIdentFinal(Codepoint cp) {
19 return isIdentInitial(cp) || Character.isDigit(cp.cp());
20 }
21
22 public static boolean isNewline(Codepoint cp) {
23 return cp.cp() == '\n';
24 }
25
26 public static boolean isNumeral(Codepoint cp) {
27 return Character.isDigit(cp.cp());
28 }
29
30 public static boolean isWhitespace(Codepoint cp) {
31 return Character.isWhitespace(cp.cp());
32 }
33
34 private final PeekableStream<Codepoint> input;
35
36 public Lexer(InputStream input) {
37 this(new InputStreamReader(input));
38 }
39
40 public Lexer(Reader input) {
41 var cpStream = new BufferedReader(input)
42 .lines()
43 .flatMapToInt(str -> IntStream.concat(str.codePoints(), IntStream.of('\n')))
44 .mapToObj(Codepoint::new);
45 var theEof = Stream.of(new Codepoint(-1));
46 this.input = new PeekableStream<>(Stream.concat(cpStream, theEof).iterator());
47 }
48
49 public Lexer(String input) {
50 this(new StringReader(input));
51 }
52
53 private boolean hasNext = true;
54
55 @Override
56 public Token next() {
57 var tok = nextToken();
58 if (tok.type() == Token.Type.EOF) {
59 hasNext = false;
60 }
61 return tok;
62 }
63
64 @Override
65 public boolean hasNext() {
66 return hasNext;
67 }
68
69 private Token nextToken() {
70 skipWhitespace();
71 return switch (input.peek().cp()) {
72 case -1 -> new Token(Token.Type.EOF, "");
73
74 case '*' -> new Token(Token.Type.ASTERISK, input.next());
75 case '!' -> new Token(Token.Type.BANG, input.next());
76 case '[' -> new Token(Token.Type.BRACKET_LEFT, input.next());
77 case ']' -> new Token(Token.Type.BRACKET_RIGHT, input.next());
78 case ',' -> new Token(Token.Type.COMMA, input.next());
79 case '=' -> new Token(Token.Type.EQUAL, input.next());
80 case '>' -> {
81 var first = input.next();
82 if (input.peek().cp() == '=') {
83 yield new Token(Token.Type.GREATER_EQUAL, first, input.next());
84 } else {
85 yield new Token(Token.Type.GREATER, first);
86 }
87 }
88 case '<' -> {
89 var first = input.next();
90 if (input.peek().cp() == '=') {
91 yield new Token(Token.Type.LESS_EQUAL, first, input.next());
92 } else {
93 yield new Token(Token.Type.LESS, first);
94 }
95 }
96 case '-' -> {
97 var first = input.next();
98 if (input.peek().cp() == '>') {
99 yield new Token(Token.Type.MINUS_GREATER, first, input.next());
100 } else {
101 yield new Token(Token.Type.MINUS, first);
102 }
103 }
104 case '(' -> new Token(Token.Type.PAREN_LEFT, input.next());
105 case ')' -> new Token(Token.Type.PAREN_RIGHT, input.next());
106 case '+' -> new Token(Token.Type.PLUS, input.next());
107 case '?' -> {
108 var first = input.next();
109 if (input.peek().cp() == '=') {
110 yield new Token(Token.Type.QUESTION_EQUAL, first, input.next());
111 } else {
112 yield new Token(Token.Type.ILLEGAL, first, input.next());
113 }
114 }
115 case ';' -> new Token(Token.Type.SEMICOLON, input.next());
116 case '/' -> {
117 var first = input.next();
118 if (input.peek().cp() == '=') {
119 yield new Token(Token.Type.SLASH_EQUAL, first, input.next());
120 } else {
121 yield new Token(Token.Type.SLASH, first);
122 }
123 }
124
125 case '"' -> new Token(Token.Type.STRING, readString());
126
127 default -> {
128 if (isIdentInitial(input.peek())) {
129 var ident = readIdentifier();
130 var type = switch (ident) {
131 case "and" -> Token.Type.AND;
132 case "def" -> Token.Type.DEF;
133 case "do" -> Token.Type.DO;
134 case "else" -> Token.Type.ELSE;
135 case "end" -> Token.Type.END;
136 case "false" -> Token.Type.FALSE;
137 case "fn" -> Token.Type.FN;
138 case "if" -> Token.Type.IF;
139 case "in" -> Token.Type.IN;
140 case "let" -> Token.Type.LET;
141 case "then" -> Token.Type.THEN;
142 case "true" -> Token.Type.TRUE;
143 default -> Token.Type.IDENTIFIER;
144 };
145 yield new Token(type, ident);
146 } else if (isNumeral(input.peek())) {
147 yield new Token(Token.Type.INTEGER, readInteger());
148 } else {
149 yield new Token(Token.Type.ILLEGAL, input.next());
150 }
151 }
152 };
153 }
154
155 private <T> T foldWhile(Predicate<Codepoint> pred, T initial, BiFunction<T, Codepoint, T> combine) {
156 var res = initial;
157 var ch = input.peek();
158 while (pred.test(ch)) {
159 res = combine.apply(res, input.next());
160 ch = input.peek();
161 }
162 return res;
163 }
164
165 private String readWhile(Predicate<Codepoint> pred) {
166 return foldWhile(pred, new StringBuilder(), StringBuilder::append).toString();
167 }
168
169 private void skipWhile(Predicate<Codepoint> pred) {
170 foldWhile(pred, Object.class, (x, _) -> x);
171 }
172
173 private String readIdentifier() {
174 return readWhile(Lexer::isIdentFinal);
175 }
176
177 private String readInteger() {
178 return readWhile(Lexer::isNumeral);
179 }
180
181 private String readString() {
182 input.next();
183 var literal = readWhile(cp -> cp.cp() != '"');
184 input.next();
185 return literal;
186 }
187
188 private void skipWhitespace() {
189 while (true) {
190 skipWhile(Lexer::isWhitespace);
191 if (input.peek().cp() != '#') {
192 return;
193 }
194 skipWhile(cp -> !isNewline(cp));
195 }
196 }
197}
diff --git a/lexer/src/main/java/lv/enes/orang/lexer/Token.java b/lexer/src/main/java/lv/enes/orang/lexer/Token.java
new file mode 100644
index 0000000..59626c7
--- /dev/null
+++ b/lexer/src/main/java/lv/enes/orang/lexer/Token.java
@@ -0,0 +1,62 @@
1package lv.enes.orang.lexer;
2
3import lv.enes.orang.utils.Codepoint;
4
5public record Token(Type type, String literal) {
6 public Token(Type type, Codepoint... cps) {
7 this(type, codepointsToString(cps));
8 }
9
10 private static String codepointsToString(Codepoint... cps) {
11 var sb = new StringBuilder(cps.length);
12 for (var cp : cps) {
13 sb.append(cp);
14 }
15 return sb.toString();
16 }
17
18 public enum Type {
19 ILLEGAL,
20 EOF,
21
22 // Literals
23 IDENTIFIER,
24 INTEGER,
25 STRING,
26
27 // Keywords
28 AND,
29 DEF,
30 DO,
31 ELSE,
32 END,
33 FALSE,
34 FN,
35 IF,
36 IN,
37 LET,
38 THEN,
39 TRUE,
40
41 // Special chars
42 ASTERISK,
43 BANG,
44 BRACKET_LEFT,
45 BRACKET_RIGHT,
46 COMMA,
47 EQUAL,
48 GREATER,
49 GREATER_EQUAL,
50 LESS,
51 LESS_EQUAL,
52 MINUS,
53 MINUS_GREATER,
54 PAREN_LEFT,
55 PAREN_RIGHT,
56 PLUS,
57 QUESTION_EQUAL,
58 SEMICOLON,
59 SLASH,
60 SLASH_EQUAL,
61 }
62}
diff --git a/lexer/src/main/java/module-info.java b/lexer/src/main/java/module-info.java
new file mode 100644
index 0000000..a57a694
--- /dev/null
+++ b/lexer/src/main/java/module-info.java
@@ -0,0 +1,5 @@
1module lv.enes.orang.lexer {
2 exports lv.enes.orang.lexer;
3
4 requires lv.enes.orang.utils;
5} \ No newline at end of file