CBMC
smt2_tokenizer.cpp
Go to the documentation of this file.
1 /*******************************************************************\
2 
3 Module:
4 
5 Author: Daniel Kroening, kroening@kroening.com
6 
7 \*******************************************************************/
8 
9 #include "smt2_tokenizer.h"
10 
12 {
13  // any non-empty sequence of letters, digits and the characters
14  // ~ ! @ $ % ^ & * _ - + = < > . ? /
15  // that does not start with a digit and is not a reserved word.
16 
17  return isalnum(ch) ||
18  ch=='~' || ch=='!' || ch=='@' || ch=='$' || ch=='%' ||
19  ch=='^' || ch=='&' || ch=='*' || ch=='_' || ch=='-' ||
20  ch=='+' || ch=='=' || ch=='<' || ch=='>' || ch=='.' ||
21  ch=='?' || ch=='/';
22 }
23 
25 {
26  // any non-empty sequence of letters, digits and the characters
27  // ~ ! @ $ % ^ & * _ - + = < > . ? /
28  // that does not start with a digit and is not a reserved word.
29 
30  buffer.clear();
31 
32  char ch;
33  while(in->get(ch))
34  {
36  {
37  buffer+=ch;
38  }
39  else
40  {
41  in->unget(); // put back
42  quoted_symbol = false;
43  return SYMBOL;
44  }
45  }
46 
47  // eof -- this is ok here
48  if(buffer.empty())
49  return END_OF_FILE;
50  else
51  {
52  quoted_symbol = false;
53  return SYMBOL;
54  }
55 }
56 
58 {
59  // we accept any sequence of digits and dots
60 
61  buffer.clear();
62 
63  char ch;
64  while(in->get(ch))
65  {
66  if(isdigit(ch) || ch=='.')
67  {
68  buffer+=ch;
69  }
70  else
71  {
72  in->unget(); // put back
73  return NUMERAL;
74  }
75  }
76 
77  // eof -- this is ok here
78  if(buffer.empty())
79  return END_OF_FILE;
80  else
81  return NUMERAL;
82 }
83 
85 {
86  // we accept any sequence of '0' or '1'
87 
88  buffer.clear();
89  buffer+='#';
90  buffer+='b';
91 
92  char ch;
93  while(in->get(ch))
94  {
95  if(ch=='0' || ch=='1')
96  {
97  buffer+=ch;
98  }
99  else
100  {
101  in->unget(); // put back
102  return NUMERAL;
103  }
104  }
105 
106  // eof -- this is ok here
107  if(buffer.empty())
108  return END_OF_FILE;
109  else
110  return NUMERAL;
111 }
112 
114 {
115  // we accept any sequence of '0'-'9', 'a'-'f', 'A'-'F'
116 
117  buffer.clear();
118  buffer+='#';
119  buffer+='x';
120 
121  char ch;
122  while(in->get(ch))
123  {
124  if(isxdigit(ch))
125  {
126  buffer+=ch;
127  }
128  else
129  {
130  in->unget(); // put back
131  return NUMERAL;
132  }
133  }
134 
135  // eof -- this is ok here
136  if(buffer.empty())
137  return END_OF_FILE;
138  else
139  return NUMERAL;
140 }
141 
143 {
144  // any sequence of printable ASCII characters (including space,
145  // tab, and line-breaking characters) except for the backslash
146  // character \, that starts and ends with | and does not otherwise
147  // contain |
148 
149  buffer.clear();
150 
151  char ch;
152  while(in->get(ch))
153  {
154  if(ch=='|')
155  {
156  quoted_symbol = true;
157  return SYMBOL; // done
158  }
159 
160  buffer+=ch;
161 
162  if(ch=='\n')
163  line_no++;
164  }
165 
166  // Hmpf. Eof before end of quoted symbol. This is an error.
167  throw error("EOF within quoted symbol");
168 }
169 
171 {
172  buffer.clear();
173 
174  char ch;
175  while(in->get(ch))
176  {
177  if(ch=='"')
178  {
179  // quotes may be escaped by repeating
180  if(in->get(ch))
181  {
182  if(ch=='"')
183  {
184  }
185  else
186  {
187  in->unget();
188  return STRING_LITERAL; // done
189  }
190  }
191  else
192  return STRING_LITERAL; // done
193  }
194  buffer+=ch;
195  }
196 
197  // Hmpf. Eof before end of string literal. This is an error.
198  throw error("EOF within string literal");
199 }
200 
202 {
203  if(peeked)
204  peeked = false;
205  else
207 
208  return token;
209 }
210 
212 {
213  char ch;
214 
215  while(in->get(ch))
216  {
217  switch(ch)
218  {
219  case '\n':
220  line_no++;
221  break;
222 
223  case ' ':
224  case '\r':
225  case '\t':
226  case static_cast<char>(160): // non-breaking space
227  // skip any whitespace
228  break;
229 
230  case ';': // comment
231  // skip until newline
232  while(in->get(ch))
233  {
234  if(ch=='\n')
235  {
236  line_no++;
237  break;
238  }
239  }
240  break;
241 
242  case '(':
243  // produce sub-expression
244  token = OPEN;
245  return;
246 
247  case ')':
248  // done with sub-expression
249  token = CLOSE;
250  return;
251 
252  case '|': // quoted symbol
254  return;
255 
256  case '"': // string literal
258  return;
259 
260  case ':': // keyword
262  if(token == SYMBOL)
263  {
264  token = KEYWORD;
265  return;
266  }
267  else
268  throw error("expecting symbol after colon");
269 
270  case '#':
271  if(in->get(ch))
272  {
273  if(ch=='b')
274  {
276  return;
277  }
278  else if(ch=='x')
279  {
281  return;
282  }
283  else
284  throw error("unknown numeral token");
285  }
286  else
287  throw error("unexpected EOF in numeral token");
288  break;
289 
290  default: // likely a simple symbol or a numeral
291  if(isdigit(ch))
292  {
293  in->unget();
295  return;
296  }
298  {
299  in->unget();
301  return;
302  }
303  else
304  {
305  // illegal character, error
306  throw error() << "unexpected character '" << ch << '\'';
307  }
308  }
309  }
310 
311  token = END_OF_FILE;
312 }
void get_token_from_stream()
read a token from the input stream and store it in 'token'
tokent get_string_literal()
tokent get_decimal_numeral()
enum { NONE, END_OF_FILE, STRING_LITERAL, NUMERAL, SYMBOL, KEYWORD, OPEN, CLOSE } tokent
std::istream * in
smt2_errort error() const
generate an error exception
tokent get_bin_numeral()
tokent get_simple_symbol()
tokent get_quoted_symbol()
std::string buffer
tokent get_hex_numeral()
int isdigit(int c)
Definition: ctype.c:24
int isxdigit(int c)
Definition: ctype.c:95
int isalnum(int c)
Definition: ctype.c:4
bool is_smt2_simple_symbol_character(char ch)