Synopsis - Cross-Reference
File:
src/Synopsis/Lexer.cc 1
2
3
4
5
6
7
8#include "Synopsis/Lexer.hh"
9#include "Synopsis/Buffer.hh"
10#include <iostream>
11#include <cassert>
12#include <string>
13
14using namespace Synopsis;
15
16Lexer::Lexer(Buffer *buffer, int tokenset)
17 : my_buffer(buffer),
18 my_token(my_buffer->ptr(), 0, '\n')
19{
20 my_keywords["asm"] = Token::ATTRIBUTE;
21 my_keywords["auto"] = Token::AUTO;
22 my_keywords["break"] = Token::BREAK;
23 my_keywords["case"] = Token::CASE;
24 my_keywords["char"] = Token::CHAR;
25 // FIXME: Add support for _Complex to Parser.
26 my_keywords["_Complex"] = Token::Ignore;
27 my_keywords["const"] = Token::CONST;
28 my_keywords["continue"] = Token::CONTINUE;
29 my_keywords["default"] = Token::DEFAULT;
30 my_keywords["do"] = Token::DO;
31 my_keywords["double"] = Token::DOUBLE;
32 my_keywords["else"] = Token::ELSE;
33 my_keywords["enum"] = Token::ENUM;
34 my_keywords["extern"] = Token::EXTERN;
35 my_keywords["float"] = Token::FLOAT;
36 my_keywords["for"] = Token::FOR;
37 my_keywords["goto"] = Token::GOTO;
38 my_keywords["if"] = Token::IF;
39 my_keywords["inline"] = Token::INLINE;
40 my_keywords["int"] = Token::INT;
41 my_keywords["long"] = Token::LONG;
42 my_keywords["register"] = Token::REGISTER;
43 my_keywords["return"] = Token::RETURN;
44 my_keywords["short"] = Token::SHORT;
45 my_keywords["signed"] = Token::SIGNED;
46 my_keywords["sizeof"] = Token::SIZEOF;
47 my_keywords["static"] = Token::STATIC;
48 my_keywords["struct"] = Token::STRUCT;
49 my_keywords["switch"] = Token::SWITCH;
50 my_keywords["typedef"] = Token::TYPEDEF;
51 my_keywords["union"] = Token::UNION;
52 my_keywords["unsigned"] = Token::UNSIGNED;
53 my_keywords["void"] = Token::VOID;
54 my_keywords["volatile"] = Token::VOLATILE;
55 my_keywords["while"] = Token::WHILE;
56 if (tokenset & CXX)
57 {
58 my_keywords["bool"] = Token::BOOLEAN;
59 my_keywords["catch"] = Token::CATCH;
60 my_keywords["class"] = Token::CLASS;
61 my_keywords["delete"] = Token::DELETE;
62 my_keywords["false"] = Token::Constant;
63 my_keywords["friend"] = Token::FRIEND;
64 my_keywords["mutable"] = Token::MUTABLE;
65 my_keywords["namespace"] = Token::NAMESPACE;
66 my_keywords["new"] = Token::NEW;
67 my_keywords["operator"] = Token::OPERATOR;
68 my_keywords["private"] = Token::PRIVATE;
69 my_keywords["protected"] = Token::PROTECTED;
70 my_keywords["public"] = Token::PUBLIC;
71 my_keywords["template"] = Token::TEMPLATE;
72 my_keywords["this"] = Token::THIS;
73 my_keywords["throw"] = Token::THROW;
74 my_keywords["true"] = Token::Constant;
75 my_keywords["try"] = Token::TRY;
76 my_keywords["typeid"] = Token::TYPEID;
77 my_keywords["typename"] = Token::TYPENAME;
78 my_keywords["using"] = Token::USING;
79 my_keywords["virtual"] = Token::VIRTUAL;
80 my_keywords["wchar_t"] = Token::WCHAR;
81 }
82 if (tokenset & GCC)
83 {
84 my_keywords["__alignof__"] = Token::SIZEOF;
85 my_keywords["__asm"] = Token::ATTRIBUTE;
86 my_keywords["__asm__"] = Token::ATTRIBUTE;
87 my_keywords["__attribute__"] = Token::ATTRIBUTE;
88 my_keywords["__builtin_offsetof"] = Token::OFFSETOF;
89 my_keywords["__builtin_va_arg"] = Token::EXTENSION; // Is this correct ?
90 my_keywords["__complex__"] = Token::Ignore;
91 my_keywords["__const"] = Token::CONST;
92 my_keywords["__extension__"] = Token::EXTENSION;
93 my_keywords["__imag__"] = Token::Ignore;
94 my_keywords["__inline"] = Token::INLINE;
95 my_keywords["__inline__"] = Token::INLINE;
96 my_keywords["__real__"] = Token::Ignore;
97 my_keywords["__restrict"] = Token::Ignore;
98 my_keywords["__restrict__"] = Token::Ignore;
99 my_keywords["__signed"] = Token::SIGNED;
100 my_keywords["__signed__"] = Token::SIGNED;
101 my_keywords["typeof"] = Token::TYPEOF;
102 my_keywords["__typeof"] = Token::TYPEOF;
103 my_keywords["__typeof__"] = Token::TYPEOF;
104 }
105 if (tokenset & MSVC)
106 {
107 my_keywords["cdecl"] = Token::Ignore;
108 my_keywords["_cdecl"] = Token::Ignore;
109 my_keywords["__cdecl"] = Token::Ignore;
110 my_keywords["_fastcall"] = Token::Ignore;
111 my_keywords["__fastcall"] = Token::Ignore;
112 my_keywords["_stdcall"] = Token::Ignore;
113 my_keywords["__stdcall"] = Token::Ignore;
114 my_keywords["__thiscall"] = Token::Ignore;
115 my_keywords["_based"] = Token::Ignore;
116 my_keywords["__based"] = Token::Ignore;
117 my_keywords["_asm"] = Token::ASM;
118 my_keywords["__asm"] = Token::ASM;
119 my_keywords["_inline"] = Token::INLINE;
120 my_keywords["__inline"] = Token::INLINE;
121 my_keywords["__declspec"] = Token::DECLSPEC;
122 my_keywords["__pragma"] = Token::PRAGMA;
123 my_keywords["__int8"] = Token::CHAR;
124 my_keywords["__int16"] = Token::SHORT;
125 my_keywords["__int32"] = Token::INT;
126 my_keywords["__int64"] = Token::INT64;
127 my_keywords["__w64"] = Token::Ignore;
128 }
129}
130
131Token::Type Lexer::get_token(Token &t)
132{
133 if (!fill(1)) return Token::BadToken;
134 t = my_tokens.front();
135 my_tokens.pop();
136 return t.type;
137}
138
139Token::Type Lexer::look_ahead(size_t offset)
140{
141 if (!fill(offset + 1)) return Token::BadToken;
142 return my_tokens.at(offset).type;
143}
144
145Token::Type Lexer::look_ahead(size_t offset, Token &t)
146{
147 if (!fill(offset + 1)) return Token::BadToken;
148 t = my_tokens.at(offset);
149 return t.type;
150}
151
152const char *Lexer::save()
153{
154 if (!fill(1)) throw std::runtime_error("unexpected EOF");
155 Token current = my_tokens.front();
156 return current.ptr;
157}
158
159void Lexer::restore(const char *pos)
160{
161 my_token.type = '\n';
162 my_token.ptr = my_buffer->ptr();
163 my_token.length = 0;
164 my_tokens.clear();
165 rewind(pos);
166}
167
168unsigned long Lexer::origin(const char *ptr, std::string &filename) const
169{
170 return my_buffer->origin(ptr, filename);
171}
172
173void Lexer::rewind(const char *p)
174{
175 my_buffer->reset(p - my_buffer->ptr());
176}
177
178Token::Type Lexer::read_token(const char *&ptr, size_t &length)
179{
180 Token::Type t = Token::BadToken;
181 while(true)
182 {
183 t = read_line();
184 if(t == Token::Ignore) continue;
185 my_token.type = t;
186
187 if(t == Token::ATTRIBUTE)
188 {
189 skip_attribute();
190 continue;
191 }
192 else if(t == Token::EXTENSION)
193 {
194 t = skip_extension(ptr, length);
195 if(t == Token::Ignore) continue;
196 else return t;
197 }
198 else if(t == Token::ASM)
199 {
200 skip_asm();
201 continue;
202 }
203 else if(t == Token::DECLSPEC)
204 {
205 skip_declspec();
206 continue;
207 }
208 else if(t == Token::PRAGMA)
209 {
210 skip_pragma();
211 continue;
212 }
213 if(t != '\n') break;
214 }
215
216 ptr = my_token.ptr;
217 length = my_token.length;
218 return t;
219}
220
221bool Lexer::fill(size_t o)
222{
223 while (my_tokens.size() < o)
224 {
225 Token t;
226 t.type = read_token(t.ptr, t.length);
227 if (t.type == Token::BadToken) return false;
228 my_tokens.push(t);
229 }
230 return true;
231}
232
233void Lexer::skip_attribute()
234{
235 char c;
236 do { c = my_buffer->get();}
237 while(c != '(' && c != '\0');
238 if (c == '\0') return;
239 skip_paren();
240}
241
242Token::Type Lexer::skip_extension(const char *&ptr, size_t &length)
243{
244 ptr = my_token.ptr;
245 length = my_token.length;
246
247 char c;
248 do { c = my_buffer->get();}
249 while(is_blank(c) || c == '\n');
250
251 if(c != '(')
252 {
253 my_buffer->unget();
254 return Token::Ignore;
255 }
256 skip_paren();
257 return Token::Identifier;
258}
259
260inline bool check_end_of_instruction(Buffer *buffer, char c, const char *delimiter)
261{
262 if (c == '\0') return true;
263 if (strchr(delimiter, c))
264 {
265 buffer->unget();
266 return true;
267 }
268 return false;
269}
270
271void Lexer::skip_paren()
272{
273 size_t i = 1;
274 do
275 {
276 char c = my_buffer->get();
277 if (c == '\0') return;
278 if(c == '(') ++i;
279 else if(c == ')') --i;
280 } while(i > 0);
281}
282
283void Lexer::skip_line()
284{
285 char c;
286 do { c = my_buffer->get();}
287 while(c != '\n' && c != '\0');
288}
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305void Lexer::skip_asm()
306{
307 char c;
308
309 do
310 {
311 c = my_buffer->get();
312 if (check_end_of_instruction(my_buffer, c, "")) return;
313 }
314 while(is_blank(c) || c == '\n');
315
316 if(c == '{')
317 {
318 size_t i = 1;
319 do
320 {
321 c = my_buffer->get();
322 if (check_end_of_instruction(my_buffer, c, "")) return;
323 if(c == '{') ++i;
324 else if(c == '}') --i;
325 } while(i > 0);
326 }
327 else
328 {
329 while(true)
330 {
331 if (check_end_of_instruction(my_buffer, c, "}\n")) return;
332 c = my_buffer->get();
333 }
334 }
335}
336
337void Lexer::skip_declspec()
338{
339 char c;
340 do
341 {
342 c = my_buffer->get();
343 if (check_end_of_instruction(my_buffer, c, "")) return;
344 } while(is_blank(c));
345
346 if (c == '(')
347 {
348 size_t i = 1;
349 do
350 {
351 c = my_buffer->get();
352 if (check_end_of_instruction(my_buffer, c, "};")) return;
353 if(c == '(') ++i;
354 else if(c == ')') --i;
355 } while(i > 0);
356 }
357}
358
359void Lexer::skip_pragma()
360{
361 char c = get_next_non_white_char();
362
363 if (c == '(')
364 {
365 size_t i = 1;
366 do
367 {
368 c = my_buffer->get();
369 if (check_end_of_instruction(my_buffer, c, "};")) return;
370 if(c == '(') ++i;
371 else if(c == ')') --i;
372 } while(i > 0);
373
374 c = get_next_non_white_char();
375 }
376}
377
378char Lexer::get_next_non_white_char()
379{
380 char c;
381 while(true)
382 {
383 do { c = my_buffer->get();}
384 while(is_blank(c));
385
386 if(c != '\\') break;
387
388 c = my_buffer->get();
389 if(c != '\n' && c!= '\r')
390 {
391 my_buffer->unget();
392 break;
393 }
394 }
395 return c;
396}
397
398Token::Type Lexer::read_line()
399{
400 char c = get_next_non_white_char();
401 unsigned long top = my_buffer->position();
402 my_token.ptr = my_buffer->ptr(top);
403 if(c == '\0')
404 {
405 my_buffer->unget();
406 return '\0';
407 }
408 else if(c == '\n') return '\n';
409 else if(c == '#' && my_token.type == '\n')
410 {
411 skip_line();
412 return '\n';
413 }
414 else if(c == '\'' || c == '"')
415 {
416 if(c == '\'')
417 {
418 if(read_char_const(top)) return Token::CharConst;
419 }
420 else
421 {
422 if(read_str_const(top)) return Token::StringL;
423 }
424 my_buffer->reset(top + 1);
425 my_token.length = 1;
426 return single_char_op(c);
427 }
428 else if(is_digit(c)) return read_number(c, top);
429 else if(c == '.')
430 {
431 c = my_buffer->get();
432 if(is_digit(c)) return read_float(top);
433 else
434 {
435 my_buffer->unget();
436 return read_separator('.', top);
437 }
438 }
439 else if(is_letter(c))
440 {
441 if (c == 'L')
442 {
443 c = my_buffer->get();
444 if (c == '\'' || c == '"')
445 {
446 if (c == '\'')
447 {
448 if (read_char_const(top+1))
449 {
450 ++my_token.length;
451 return Token::WideCharConst;
452 }
453 }
454 else
455 {
456 if(read_str_const(top+1))
457 {
458 ++my_token.length;
459 return Token::WideStringL;
460 }
461 }
462 }
463 my_buffer->reset(top);
464 }
465 return read_identifier(top);
466 }
467 else return read_separator(c, top);
468}
469
470bool Lexer::read_char_const(unsigned long top)
471{
472 while(true)
473 {
474 char c = my_buffer->get();
475 if(c == '\\')
476 {
477 c = my_buffer->get();
478 if(c == '\0') return false;
479 }
480 else if(c == '\'')
481 {
482 my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
483 return true;
484 }
485 else if(c == '\n' || c == '\0') return false;
486 }
487}
488
489
490
491
492
493
494bool Lexer::read_str_const(unsigned long top)
495{
496 // Skip the L if there is one
497 if (my_buffer->at(top) == 'L') my_buffer->get();
498 while(true)
499 {
500 char c = my_buffer->get();
501 if(c == '\\')
502 {
503 c = my_buffer->get();
504 if(c == '\0') return false;
505 }
506 else if(c == '"')
507 {
508 // We are past one string literal token now.
509 // Any following whitespace needs to be skipped
510 // before looking for anything else.
511 unsigned long pos = my_buffer->position() + 1;
512 while (true)
513 {
514 int nline = 0;
515 // Consume whitespace.
516 do
517 {
518 c = my_buffer->get();
519 if(c == '\n') ++nline;
520 } while(is_blank(c) || c == '\n');
521 // Consume comment.
522 if (c == '/')
523 {
524 char d = my_buffer->get();
525 if (d == '/' || d == '*')
526 read_comment(d, my_buffer->position() - 2);
527 else
528 {
529 my_buffer->unget();
530 break;
531 }
532 }
533 else break;
534 }
535 if(c == '"')
536 /* line_number += nline; */ ;
537 else
538 {
539 my_token.length = static_cast<size_t>(pos - top);
540 my_buffer->reset(pos);
541 return true;
542 }
543 }
544 else if(c == '\n' || c == '\0') return false;
545 }
546}
547
548Token::Type Lexer::read_number(char c, unsigned long top)
549{
550 char c2 = my_buffer->get();
551
552 if(c == '0' && is_xletter(c2))
553 {
554 do { c = my_buffer->get();}
555 while(is_hexdigit(c));
556 while(is_int_suffix(c)) c = my_buffer->get();
557
558 my_buffer->unget();
559 my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
560 return Token::Constant;
561 }
562
563 while(is_digit(c2)) c2 = my_buffer->get();
564
565 if(is_int_suffix(c2))
566 do { c2 = my_buffer->get();}
567 while(is_int_suffix(c2));
568 else if(c2 == '.') return read_float(top);
569 else if(is_eletter(c2))
570 {
571 my_buffer->unget();
572 return read_float(top);
573 }
574
575 my_buffer->unget();
576 my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
577 return Token::Constant;
578}
579
580Token::Type Lexer::read_float(unsigned long top)
581{
582 char c;
583
584 do { c = my_buffer->get();}
585 while(is_digit(c));
586 if(is_float_suffix(c))
587 do { c = my_buffer->get();}
588 while(is_float_suffix(c));
589 else if(is_eletter(c))
590 {
591 unsigned long p = my_buffer->position();
592 c = my_buffer->get();
593 if(c == '+' || c == '-')
594 {
595 c = my_buffer->get();
596 if(!is_digit(c))
597 {
598