lexer source code

1 /*
2 Copyright (c) 2011-2014 Timur Gafarov 
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 module lexer;
30 
31 import std.string;
32 import std.ascii;
33 import std.conv;
34 import std.algorithm;
35 
/*
 * Default delimiter set, used as the default argument of
 * Lexer.addDelimiters. The grouping below is for readability only:
 * the lexer sorts its delimiter list by length after every addition.
 */
static string[] stddelimiters =
[
    // two-character comparison and compound-assignment operators
    "==", "!=", "<=", ">=", "+=", "-=", "*=", "/=",

    // other two-character operators
    "++", "--", "||", "&&", "<<", ">>", "<>",

    // comment markers and two-character escape sequences
    "//", "/*", "*/", "\\\\", "\\\"", "\\\'",

    // single-character operators
    "+", "-", "*", "/", "%", "=", "|", "^", "~", "<", ">", "!",

    // brackets
    "(", ")", "{", "}", "[", "]",

    // punctuation
    ";", ":", ",", "@", "#", "$", "&",

    // backslash and quote characters
    "\\", "\"", "\'"
];
46 
47 /* 
48  * Helper functions
49  */
50 
/*
 * Linear membership test: returns true when some element of `list`
 * compares equal to `input`, false otherwise (including for an empty list).
 */
bool matches(T)(T input, T[] list)
{
    for (size_t i = 0; i < list.length; ++i)
    {
        if (list[i] == input)
            return true;
    }
    return false;
}
57 
/*
 * Reports whether `input` begins with one of the characters in
 * std.ascii.whitespace. Only the first character is inspected;
 * an empty string yields false.
 */
bool isWhitespace(string input)
{
    return input.length != 0 && matches(input[0], std.ascii.whitespace);
}
64 
65 /* 
66  * General-purpose lexical analyzer
67  */
68 
/**
 * Splits a source string into lexemes.
 *
 * The input is cut at whitespace characters and at the registered
 * delimiters (longest delimiter first). On top of that raw stream,
 * getLexeme skips whitespace and comments and glues a quoted string
 * literal, quotes included, into a single lexeme.
 *
 * Delimiters must be registered with addDelimiter / addDelimiters before
 * lexing; comment markers and quote characters are only recognized if they
 * come out of the raw stream as separate lexemes.
 */
final class Lexer
{
   private:
    /// Scanning state of getLexeme; exactly one state is active at a time.
    enum Mode { code, lineComment, blockComment, stringLiteral }

    string source;
    size_t pos = 0;          // index of the next unread character of source
    string[] delimiters;     // kept sorted, longest first

   public:
    string singleLineComment = "//";
    string multiLineCommentBegin = "/*";
    string multiLineCommentEnd = "*/";
    string[] stringLiteralQuote = ["\"","\'"];

    /// Lexeme stored by the most recent readNext call.
    string current;

    /// 1-based number of the line the scanner is currently on.
    uint line = 1;
    /// Value of `line` when the first lexeme of a line was last returned.
    uint lastTokenLine = 0;
    /// True until getLexeme returns a lexeme on the current line.
    bool noTokensOnCurrentLine = true;

    /// Creates a lexer over `src`. No delimiters are registered yet.
    this(string src)
    {
        source = src;
    }

    /// Registers one additional delimiter.
    void addDelimiter(string op)
    {
        delimiters ~= op;
        sortDelimiters();
    }

    /// Registers several delimiters at once (the standard set by default).
    void addDelimiters(string[] op = stddelimiters)
    {
        delimiters ~= op;
        sortDelimiters();
    }

    /// Reads the next lexeme and stores it in `current`.
    void readNext()
    {
        current = getLexeme();
    }

    /**
     * Returns the next lexeme that is neither whitespace nor part of a
     * comment. A string literal is returned as one lexeme that includes
     * its opening and closing quote; everything between the quotes,
     * including comment markers and line breaks, is kept verbatim.
     *
     * Returns an empty (null) string when the input is exhausted. This
     * also happens when the input ends inside a comment or inside an
     * unterminated string literal: the partial literal is discarded.
     *
     * Updates `line`, `lastTokenLine` and `noTokensOnCurrentLine`.
     */
    string getLexeme()
    {
        string result;           // stays null if the input runs out
        string literal;          // string literal collected so far
        Mode mode = Mode.code;
        bool done = false;

        while (!done)
        {
            string lexeme = getLexemeUnfiltered();
            if (lexeme.length == 0)
                break;           // end of input

            // Line breaks are counted in every state.
            if (lexeme == "\n")
            {
                if (mode == Mode.stringLiteral)
                    literal ~= lexeme;
                else if (mode == Mode.lineComment)
                    mode = Mode.code;
                line++;
                noTokensOnCurrentLine = true;
                continue;
            }

            final switch (mode)
            {
                case Mode.lineComment:
                    // Ignored up to the next line break.
                    break;

                case Mode.blockComment:
                    if (lexeme == multiLineCommentEnd)
                        mode = Mode.code;
                    break;

                case Mode.stringLiteral:
                    // Every lexeme belongs to the literal, comment markers
                    // included; only a quote starting with the same
                    // character as the opening quote terminates it.
                    literal ~= lexeme;
                    if (matches(lexeme, stringLiteralQuote) && lexeme[0] == literal[0])
                    {
                        result = literal;
                        done = true;
                    }
                    break;

                case Mode.code:
                    if (lexeme == singleLineComment)
                        mode = Mode.lineComment;
                    else if (lexeme == multiLineCommentBegin)
                        mode = Mode.blockComment;
                    else if (lexeme == multiLineCommentEnd)
                    {
                        // Stray comment terminator outside a comment: skipped.
                    }
                    else if (matches(lexeme, stringLiteralQuote))
                    {
                        literal = lexeme;
                        mode = Mode.stringLiteral;
                    }
                    else if (!isWhitespace(lexeme))
                    {
                        result = lexeme;
                        done = true;
                    }
                    break;
            }
        }

        if (result.length != 0 && noTokensOnCurrentLine)
        {
            lastTokenLine = line;
            noTokensOnCurrentLine = false;
        }

        return result;
    }

   private:
    /// Restores the longest-first order that matchForward relies on.
    void sortDelimiters()
    {
        sort!((a, b) => a.length > b.length)(delimiters);
    }

    /**
     * Returns the next raw lexeme and advances `pos` past it. A raw lexeme
     * is a single whitespace character, a single delimiter, or a maximal
     * run of characters containing neither. Returns an empty string at the
     * end of the input.
     */
    string getLexemeUnfiltered()
    {
        immutable size_t start = pos;
        while (pos < source.length)
        {
            immutable bool nothingRead = (pos == start);

            if (matches(source[pos], std.ascii.whitespace))
            {
                // Whitespace is a lexeme of its own, or ends the current run.
                if (nothingRead) pos++;
                break;
            }

            immutable size_t delimiterLength = matchForward(pos, delimiters).length;
            if (delimiterLength > 0)
            {
                // Same for a delimiter.
                if (nothingRead) pos += delimiterLength;
                break;
            }

            pos++;
        }
        return source[start .. pos];
    }

    /**
     * Returns the first entry of `list` that occurs in the source at index
     * `start`, or an empty string if none does.
     */
    string matchForward(size_t start, string[] list)
    {
        foreach (candidate; list)
        {
            if (candidate.length == 0)
                continue;        // an empty entry can never be a lexeme
            string ahead = getForward(start, candidate.length);
            if (ahead == candidate)
                return ahead;
        }
        return "";
    }

    /**
     * Returns up to `num` characters of the source starting at `position`;
     * the result is shorter when the source ends first, and empty when
     * `position` is at or past the end.
     */
    string getForward(size_t position, size_t num)
    {
        if (position >= source.length)
            return "";
        immutable size_t available = source.length - position;
        immutable size_t stop = (num < available) ? position + num : source.length;
        return source[position .. stop];
    }
}
238