Synopsis - Cross-Reference

File: src/Synopsis/Lexer.cc
  1//
  2// Copyright (C) 2004 Stefan Seefeld
  3// All rights reserved.
  4// Licensed to the public under the terms of the GNU LGPL (>= 2),
  5// see the file COPYING for details.
  6//
  7
#include "Synopsis/Lexer.hh"
#include "Synopsis/Buffer.hh"
#include <cassert>
#include <cstring>   // std::strchr (check_end_of_instruction)
#include <iostream>
#include <stdexcept> // std::runtime_error (Lexer::save)
#include <string>
 13
 14using namespace Synopsis;
 15
 16Lexer::Lexer(Buffer *buffer, int tokenset)
 17  : my_buffer(buffer),
 18    my_token(my_buffer->ptr(), 0, '\n')
 19{
 20  my_keywords["asm"] = Token::ATTRIBUTE;
 21  my_keywords["auto"] = Token::AUTO;
 22  my_keywords["break"] = Token::BREAK;
 23  my_keywords["case"] = Token::CASE;
 24  my_keywords["char"] = Token::CHAR;
 25  // FIXME: Add support for _Complex to Parser.
 26  my_keywords["_Complex"] = Token::Ignore;
 27  my_keywords["const"] = Token::CONST;
 28  my_keywords["continue"] = Token::CONTINUE;
 29  my_keywords["default"] = Token::DEFAULT;
 30  my_keywords["do"] = Token::DO;
 31  my_keywords["double"] = Token::DOUBLE;
 32  my_keywords["else"] = Token::ELSE;
 33  my_keywords["enum"] = Token::ENUM;
 34  my_keywords["extern"] = Token::EXTERN;
 35  my_keywords["float"] = Token::FLOAT;
 36  my_keywords["for"] = Token::FOR;
 37  my_keywords["goto"] = Token::GOTO;
 38  my_keywords["if"] = Token::IF;
 39  my_keywords["inline"] = Token::INLINE;
 40  my_keywords["int"] = Token::INT;
 41  my_keywords["long"] = Token::LONG;
 42  my_keywords["register"] = Token::REGISTER;
 43  my_keywords["return"] = Token::RETURN;
 44  my_keywords["short"] = Token::SHORT;
 45  my_keywords["signed"] = Token::SIGNED;
 46  my_keywords["sizeof"] = Token::SIZEOF;
 47  my_keywords["static"] = Token::STATIC;
 48  my_keywords["struct"] = Token::STRUCT;
 49  my_keywords["switch"] = Token::SWITCH;
 50  my_keywords["typedef"] = Token::TYPEDEF;
 51  my_keywords["union"] = Token::UNION;
 52  my_keywords["unsigned"] = Token::UNSIGNED;
 53  my_keywords["void"] = Token::VOID;
 54  my_keywords["volatile"] = Token::VOLATILE;
 55  my_keywords["while"] = Token::WHILE;
 56  if (tokenset & CXX)
 57  {
 58    my_keywords["bool"] = Token::BOOLEAN;
 59    my_keywords["catch"] = Token::CATCH;
 60    my_keywords["class"] = Token::CLASS;
 61    my_keywords["delete"] = Token::DELETE;
 62    my_keywords["false"] = Token::Constant;
 63    my_keywords["friend"] = Token::FRIEND;
 64    my_keywords["mutable"] = Token::MUTABLE;
 65    my_keywords["namespace"] = Token::NAMESPACE;
 66    my_keywords["new"] = Token::NEW;
 67    my_keywords["operator"] = Token::OPERATOR;
 68    my_keywords["private"] = Token::PRIVATE;
 69    my_keywords["protected"] = Token::PROTECTED;
 70    my_keywords["public"] = Token::PUBLIC;
 71    my_keywords["template"] = Token::TEMPLATE;
 72    my_keywords["this"] = Token::THIS;
 73    my_keywords["throw"] = Token::THROW;
 74    my_keywords["true"] = Token::Constant;
 75    my_keywords["try"] = Token::TRY;
 76    my_keywords["typeid"] = Token::TYPEID;
 77    my_keywords["typename"] = Token::TYPENAME;
 78    my_keywords["using"] = Token::USING;
 79    my_keywords["virtual"] = Token::VIRTUAL;
 80    my_keywords["wchar_t"] = Token::WCHAR;
 81  }
 82  if (tokenset & GCC)
 83  {
 84    my_keywords["__alignof__"] = Token::SIZEOF;
 85    my_keywords["__asm"] = Token::ATTRIBUTE;
 86    my_keywords["__asm__"] = Token::ATTRIBUTE;
 87    my_keywords["__attribute__"] = Token::ATTRIBUTE;
 88    my_keywords["__builtin_offsetof"] = Token::OFFSETOF;
 89    my_keywords["__builtin_va_arg"] = Token::EXTENSION; // Is this correct ?
 90    my_keywords["__complex__"] = Token::Ignore;
 91    my_keywords["__const"] = Token::CONST;
 92    my_keywords["__extension__"] = Token::EXTENSION;
 93    my_keywords["__imag__"] = Token::Ignore;
 94    my_keywords["__inline"] = Token::INLINE;
 95    my_keywords["__inline__"] = Token::INLINE;
 96    my_keywords["__real__"] = Token::Ignore;
 97    my_keywords["__restrict"] = Token::Ignore;
 98    my_keywords["__restrict__"] = Token::Ignore;
 99    my_keywords["__signed"] = Token::SIGNED;
100    my_keywords["__signed__"] = Token::SIGNED;
101    my_keywords["typeof"] = Token::TYPEOF;
102    my_keywords["__typeof"] = Token::TYPEOF;
103    my_keywords["__typeof__"] = Token::TYPEOF;
104  }
105  if (tokenset & MSVC)
106  {
107    my_keywords["cdecl"] = Token::Ignore;
108    my_keywords["_cdecl"] = Token::Ignore;
109    my_keywords["__cdecl"] = Token::Ignore;
110    my_keywords["_fastcall"] = Token::Ignore;
111    my_keywords["__fastcall"] = Token::Ignore;
112    my_keywords["_stdcall"] = Token::Ignore;
113    my_keywords["__stdcall"] = Token::Ignore;
114    my_keywords["__thiscall"] = Token::Ignore;
115    my_keywords["_based"] = Token::Ignore;
116    my_keywords["__based"] = Token::Ignore;
117    my_keywords["_asm"] = Token::ASM;
118    my_keywords["__asm"] = Token::ASM;
119    my_keywords["_inline"] = Token::INLINE;
120    my_keywords["__inline"] = Token::INLINE;
121    my_keywords["__declspec"] = Token::DECLSPEC;
122    my_keywords["__pragma"] = Token::PRAGMA;
123    my_keywords["__int8"] = Token::CHAR;
124    my_keywords["__int16"] = Token::SHORT;
125    my_keywords["__int32"] = Token::INT;
126    my_keywords["__int64"] = Token::INT64;
127    my_keywords["__w64"] = Token::Ignore;
128  }
129}
130
131Token::Type Lexer::get_token(Token &t)
132{
133  if (!fill(1)) return Token::BadToken;
134  t = my_tokens.front();
135  my_tokens.pop();
136  return t.type;
137}
138
139Token::Type Lexer::look_ahead(size_t offset)
140{
141  if (!fill(offset + 1)) return Token::BadToken;
142  return my_tokens.at(offset).type;
143}
144
145Token::Type Lexer::look_ahead(size_t offset, Token &t)
146{
147  if (!fill(offset + 1)) return Token::BadToken;
148  t = my_tokens.at(offset);
149  return t.type;
150}
151
152const char *Lexer::save()
153{
154  if (!fill(1)) throw std::runtime_error("unexpected EOF");
155  Token current = my_tokens.front();
156  return current.ptr;
157}
158
159void Lexer::restore(const char *pos)
160{
161  my_token.type = '\n';
162  my_token.ptr = my_buffer->ptr();
163  my_token.length = 0;
164  my_tokens.clear();
165  rewind(pos);
166}
167
168unsigned long Lexer::origin(const char *ptr, std::string &filename) const
169{
170  return my_buffer->origin(ptr, filename);
171}
172
173void Lexer::rewind(const char *p)
174{
175  my_buffer->reset(p - my_buffer->ptr());
176}
177
178Token::Type Lexer::read_token(const char *&ptr, size_t &length)
179{
180  Token::Type t = Token::BadToken;
181  while(true)
182  {
183    t = read_line();
184    if(t == Token::Ignore) continue;
185    my_token.type = t;
186
187    if(t == Token::ATTRIBUTE)
188    {
189      skip_attribute();
190      continue;
191    }
192    else if(t == Token::EXTENSION)
193    {
194      t = skip_extension(ptr, length);
195      if(t == Token::Ignore) continue;
196      else return t;
197    }
198    else if(t == Token::ASM)
199    {
200      skip_asm();
201      continue;
202    }
203    else if(t == Token::DECLSPEC)
204    {
205      skip_declspec();
206      continue;
207    }
208    else if(t == Token::PRAGMA)
209    {
210      skip_pragma();
211      continue;
212    }
213    if(t != '\n') break;
214  }
215
216  ptr = my_token.ptr;
217  length = my_token.length;
218  return t;
219}
220
221bool Lexer::fill(size_t o)
222{
223  while (my_tokens.size() < o)
224  {
225    Token t;
226    t.type = read_token(t.ptr, t.length);
227    if (t.type == Token::BadToken) return false;
228    my_tokens.push(t);
229  }
230  return true;
231}
232
233void Lexer::skip_attribute()
234{
235  char c;
236  do { c = my_buffer->get();}
237  while(c != '(' && c != '\0');
238  if (c == '\0') return;
239  skip_paren();
240}
241
242Token::Type Lexer::skip_extension(const char *&ptr, size_t &length)
243{
244  ptr = my_token.ptr;
245  length = my_token.length;
246
247  char c;
248  do { c = my_buffer->get();}
249  while(is_blank(c) || c == '\n');
250
251  if(c != '(')
252  {
253    my_buffer->unget();
254    return Token::Ignore; // if no (..) follows, ignore __extension__
255  }
256  skip_paren();
257  return Token::Identifier; // regards it as the identifier __extension__
258}
259
260inline bool check_end_of_instruction(Buffer *buffer, char c, const char *delimiter)
261{
262  if (c == '\0') return true;
263  if (strchr(delimiter, c))
264  {
265    buffer->unget();
266    return true;
267  }
268  return false;
269}
270
271void Lexer::skip_paren()
272{
273  size_t i = 1;
274  do
275  {
276    char c = my_buffer->get();
277    if (c == '\0') return;
278    if(c == '(') ++i;
279    else if(c == ')') --i;
280  } while(i > 0);
281}
282
283void Lexer::skip_line()
284{
285  char c;
286  do { c = my_buffer->get();}
287  while(c != '\n' && c != '\0');
288}
289
290/* You can have the following :
291
292   Just count the '{' and '}' and it should be ok
293   __asm { mov ax,1
294           mov bx,1 }
295
296   Stop when EOL found. Note that the first ';' after
297   an __asm instruction is an ASM comment !
298   int v; __asm mov ax,1 __asm mov bx,1; v=1;
299
300   Stop when '}' found
301   if (cond) {__asm mov ax,1 __asm mov bx,1}
302
303   and certainly more...
304*/
305void Lexer::skip_asm()
306{
307  char c;
308
309  do
310  {
311    c = my_buffer->get();
312    if (check_end_of_instruction(my_buffer, c, "")) return;
313  }
314  while(is_blank(c) || c == '\n');
315
316  if(c == '{')
317  {
318    size_t i = 1;
319    do
320    {
321      c = my_buffer->get();
322      if (check_end_of_instruction(my_buffer, c, "")) return;
323      if(c == '{') ++i;
324      else if(c == '}') --i;
325    } while(i > 0);
326  }
327  else
328  {
329    while(true)
330    {
331      if (check_end_of_instruction(my_buffer, c, "}\n")) return;
332      c = my_buffer->get();
333    }
334  }
335}
336
337void Lexer::skip_declspec()
338{
339  char c;
340  do
341  {
342    c = my_buffer->get();
343    if (check_end_of_instruction(my_buffer, c, "")) return;
344  } while(is_blank(c));
345
346  if (c == '(')
347  {
348    size_t i = 1;
349    do
350    {
351      c = my_buffer->get();
352      if (check_end_of_instruction(my_buffer, c, "};")) return;
353      if(c == '(') ++i;
354      else if(c == ')') --i;
355    } while(i > 0);
356  }
357}
358
359void Lexer::skip_pragma()
360{
361  char c = get_next_non_white_char();
362
363  if (c == '(')
364  {
365    size_t i = 1;
366    do
367    {
368      c = my_buffer->get();
369      if (check_end_of_instruction(my_buffer, c, "};")) return;
370      if(c == '(') ++i;
371      else if(c == ')') --i;
372    } while(i > 0);
373
374    c = get_next_non_white_char(); // assume ';'
375  }
376}
377
378char Lexer::get_next_non_white_char()
379{
380  char c;
381  while(true)
382  {
383    do { c = my_buffer->get();}
384    while(is_blank(c));
385
386    if(c != '\\') break;
387
388    c = my_buffer->get();
389    if(c != '\n' && c!= '\r') 
390    {
391      my_buffer->unget();
392      break;
393    }
394  }
395  return c;
396}
397
398Token::Type Lexer::read_line()
399{
400  char c = get_next_non_white_char();
401  unsigned long top = my_buffer->position();
402  my_token.ptr = my_buffer->ptr(top);
403  if(c == '\0')
404  {
405    my_buffer->unget();
406    return '\0';
407  }
408  else if(c == '\n') return '\n';
409  else if(c == '#' && my_token.type == '\n')
410  {
411    skip_line();
412    return '\n';
413  }
414  else if(c == '\'' || c == '"')
415  {
416    if(c == '\'')
417    {
418      if(read_char_const(top)) return Token::CharConst;
419    }
420    else
421    {
422      if(read_str_const(top)) return Token::StringL;
423    }
424    my_buffer->reset(top + 1);
425    my_token.length = 1;
426    return single_char_op(c);
427  }
428  else if(is_digit(c)) return read_number(c, top);
429  else if(c == '.')
430  {
431    c = my_buffer->get();
432    if(is_digit(c)) return read_float(top);
433    else
434    {
435      my_buffer->unget();
436      return read_separator('.', top);
437    }
438  }
439  else if(is_letter(c))
440  {
441    if (c == 'L')
442    {
443      c = my_buffer->get();
444      if (c == '\'' || c == '"')
445      {
446	if (c == '\'')
447	{
448	  if (read_char_const(top+1))
449	  {
450	    ++my_token.length;
451	    return Token::WideCharConst;
452	  }
453	} 
454	else
455	{
456	  if(read_str_const(top+1))
457	  {
458	    ++my_token.length;
459	    return Token::WideStringL;
460	  }
461	}
462      }
463      my_buffer->reset(top);
464    }
465    return read_identifier(top);
466  }
467  else return read_separator(c, top);
468}
469
470bool Lexer::read_char_const(unsigned long top)
471{
472  while(true)
473  {
474    char c = my_buffer->get();
475    if(c == '\\')
476    {
477      c = my_buffer->get();
478      if(c == '\0') return false;
479    }
480    else if(c == '\'')
481    {
482      my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
483      return true;
484    }
485    else if(c == '\n' || c == '\0') return false;
486  }
487}
488
489/*
490  If text is a sequence of string constants like:
491	"string1" "string2"  L"string3"
492  then the string constants are delt with as a single constant.
493*/
494bool Lexer::read_str_const(unsigned long top)
495{
496  // Skip the L if there is one
497  if (my_buffer->at(top) == 'L') my_buffer->get();
498  while(true)
499  {
500    char c = my_buffer->get();
501    if(c == '\\')
502    {
503      c = my_buffer->get();
504      if(c == '\0') return false;
505    }
506    else if(c == '"')
507    {
508      // We are past one string literal token now.
509      // Any following whitespace needs to be skipped
510      // before looking for anything else.
511      unsigned long pos = my_buffer->position() + 1;
512      while (true)
513      {
514	int nline = 0;
515	// Consume whitespace.
516	do
517	{
518	  c = my_buffer->get();
519	  if(c == '\n') ++nline;
520	} while(is_blank(c) || c == '\n');
521	// Consume comment.
522	if (c == '/')
523	{
524	  char d = my_buffer->get();
525	  if (d == '/' || d == '*')
526	    read_comment(d, my_buffer->position() - 2);
527	  else
528	  {
529	    my_buffer->unget();
530	    break;
531	  }
532	}
533	else break;
534      }
535      if(c == '"')
536	/* line_number += nline; */ ;
537      else
538      {
539	my_token.length = static_cast<size_t>(pos - top);
540	my_buffer->reset(pos);
541	return true;
542      }
543    }
544    else if(c == '\n' || c == '\0') return false;
545  }
546}
547
548Token::Type Lexer::read_number(char c, unsigned long top)
549{
550  char c2 = my_buffer->get();
551
552  if(c == '0' && is_xletter(c2))
553  {
554    do { c = my_buffer->get();}
555    while(is_hexdigit(c));
556    while(is_int_suffix(c)) c = my_buffer->get();
557
558    my_buffer->unget();
559    my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
560    return Token::Constant;
561  }
562
563  while(is_digit(c2)) c2 = my_buffer->get();
564
565  if(is_int_suffix(c2))
566    do { c2 = my_buffer->get();}
567    while(is_int_suffix(c2));
568  else if(c2 == '.') return read_float(top);
569  else if(is_eletter(c2))
570  {
571    my_buffer->unget();
572    return read_float(top);
573  }
574
575  my_buffer->unget();
576  my_token.length = static_cast<size_t>(my_buffer->position() - top + 1);
577  return Token::Constant;
578}
579
580Token::Type Lexer::read_float(unsigned long top)
581{
582  char c;
583    
584  do { c = my_buffer->get();}
585  while(is_digit(c));
586  if(is_float_suffix(c))
587    do { c = my_buffer->get();}
588    while(is_float_suffix(c));
589  else if(is_eletter(c))
590  {
591    unsigned long p = my_buffer->position();
592    c = my_buffer->get();
593    if(c == '+' || c == '-')
594    {
595      c = my_buffer->get();
596      if(!is_digit(c))
597      {
598